Data import

If the RDS objects have already been created in a previous run, the following cell can be uncommented and run, and the remaining cells in the data import section can be skipped. See RDS object section).

# data_list <- list.files(path = here("results-refseq/r_objects"),
#                         pattern = ".RDS",
#                         recursive = TRUE, full.names = TRUE) %>%
#   set_names(., str_remove(basename(.), ".RDS")) %>%
#   purrr::imap(~ readRDS(file = .x ))
#
# list2env(data_list, .GlobalEnv)
# rm(data_list)

Read in fastq-screen and mapping statistics

# read in fastq screen files
fastq_screen <- list.files(
  path = here("results-refseq/fastq-screen"), pattern = "_screen.txt",
  recursive = TRUE, full.names = TRUE
) %>%
  read_delim(skip = 1, delim = "\t", n_max = 2, id = "read", show_col_types = FALSE) %>%
  rename("#Multiple_hits_multiple_genomes" = Multiple_hits_multiple_genomes) %>%
  mutate(read = str_remove(basename(read), "_screen.txt")) %>%
  mutate(sample = as.factor(str_remove(read, "_R[1,2].*"))) %>%
  mutate(Genome = as.factor(Genome))

# combine R1 and R2 stats per species and recalculate percentages
fastq_screen <- fastq_screen %>%
  select(!starts_with("%")) %>%
  group_by(sample, Genome) %>%
  summarise(across(.cols = `#Reads_processed`:`#Multiple_hits_multiple_genomes`, .fns = ~ sum(.x)), .groups = "drop_last") %>%
  mutate(across(.cols = `#Unmapped`:`#Multiple_hits_multiple_genomes`, .fns = ~ .x / `#Reads_processed`, .names = "%_{.col}")) %>%
  rename_with(~ str_replace(.x, "#", ""), .cols = starts_with("%"))

# read in samtools flagstat and metadata files
mapping_stats_df <- read_delim((here("results-refseq/mapping_stats.csv")), col_types = list(sample = "factor")) %>%
  mutate(primary_mapped_human = primary_mapped_all - primary_mapped_pk)

# merge dataframes
fastq_screen_mapping_stats_df <- mapping_stats_df %>%
  left_join(fastq_screen, by = "sample")

# Picard PCR duplicates
pcr_dups <- read_delim(here("results-refseq/multiqc/star_salmon/multiqc_report_data/multiqc_picard_dups.txt"), delim = "\t", col_types = list(Sample = "factor"))

# STAR mapping statistics
star_stats <- read_delim(here("results-refseq/multiqc/star_salmon/multiqc_report_data/mqc_star_alignment_plot_1.txt"), delim = "\t", col_types = list(Sample = "factor"))

Read in salmon gene counts and tximport gene counts/TPM abundances

Import the following sets of count files:

  • Gene-level counts (created in nf-core/rnaseq pipeline through tximport of Salmon transcript-level counts )
  • Gene-level TPM (created in nf-core/rnaseq pipeline through tximport of Salmon transcript-level counts )
  • Gene-level counts inside DESeq/SummarizedExperiment object (created by manually importing Salmon quant.sf files via tximport. Values should correspond exactly to the other gene-level counts). Used for visualisation.

Background info on the different types of counts and how to import them:

salmon outputs two types of counts: transcript-level (quant.sf) and gene-level (quant.genes.sf). Both offer feature length, effective length, TPM, and number of reads. However, when importing counts with tximport for downstream differential expression analysis, transcript-level is preferred, because tximport can perform gene-level aggregation across all samples, instead of the per-sample aggregation that is done for the quant.genes.sf files. See: https://github.com/COMBINE-lab/salmon/issues/437.

For tximport, there are several different ways of creating count matrices (i.e., using raw counts with length offset or by scaling the TPM abundances), but this should only be relevant for downstream differential expression testing) See the description of different options provided by the nf-core/rnaseq pipeline: https://nf-co.re/rnaseq/3.14.0/docs/output/#pseudoalignment and the tximport docs here: https://bioconductor.org/packages/release/bioc/vignettes/tximport/inst/doc/tximport.html.

The Salmon transcript-level quant files can be imported using tximport and then converted into a DESeq object (~SummarizedExperiment). This approach provides the same count matrix as the tximport gene counts csv file, but it enables the DESeq2 approach for visualisation, namely transforming the counts using vst (variance stabilizing transformation) or rlog in order to remove the dependence of the variance on the mean. See DESeq2 vignette.

# read in sample metadata
samples <- read_csv(here("./data/samplesheet.csv"),
  col_select = c("sample", "condition", "RNA_type", "library_kit", "removal", "WBC_depletion", "kit"),
  col_types = cols(.default = "f")
) %>%
  mutate(RNA_type = str_to_title(RNA_type)) %>%
  mutate(filter_name = factor(WBC_depletion,
    levels = c("nofilter", "centpip", "plasmo", "pmacs", "cellulose"),
    labels = c("no filter", "cent/pip", "plasmodipur", "PMACS", "cellulose")
  )) %>%
  # mutate(kit = as.factor(str_replace_all(kit, "_", " "))) %>%
  mutate(kit_name = factor(kit,
    levels = c("mRNA_qiagen_FSglob", "mRNA_qiagen_FSglobH", "tRNA_qiagen_FSglobH", "mRNA_illumina"),
    labels = c("mRNA Qiagen FS globin", "mRNA Qiagen FS globin/rRNA", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina")
  )) %>%
  mutate(sample_name = as.factor(str_replace(str_replace(sample, "104974-001-0", "Sample "), "_S1_L001", paste(" -", kit_name, "-", filter_name)))) %>%
  mutate(sample_short = as.factor(str_replace(str_replace(sample, "104974-001-0", "Sample "), "_S1_L001", "")))

# read in transcript info - transcript id, gene id and genes names
transcript_info <- read.csv(here("results-refseq/star_salmon/tx2gene.tsv"),
  sep = "\t", header = FALSE,
  col.names = c("tx", "gene_id", "gene_name")
)
transcript_info <- list(
  transcript = transcript_info,
  gene = unique(transcript_info[, 2:3]),
  tx2gene = transcript_info[, 1:2]
)

# txi gene-level counts - same as using tximport and DESeqDataSetFromTximport except rounding needs to be done manually
# gene-level aggregation across all samples
gene_counts_txi <- read_delim(here("results-refseq/star_salmon/salmon.merged.gene_counts.tsv")) %>%
  mutate(across(starts_with("X"), \(x) as.integer(round(x)))) %>%
  rename_with(~ str_replace_all(.x, pattern = "\\.", replacement = "-"), .cols = `X104974.001.001_S1_L001`:`X104974.001.016_S1_L001`) %>%
  rename_with(~ str_sub(.x, 2, -1), .cols = `X104974-001-001_S1_L001`:`X104974-001-016_S1_L001`)

# txi gene-level TPM abundances
gene_tpm <- read_delim(here("results-refseq/star_salmon/salmon.merged.gene_tpm.tsv")) %>%
  rename_with(~ str_replace_all(.x, pattern = "\\.", replacement = "-"), .cols = `X104974.001.001_S1_L001`:`X104974.001.016_S1_L001`) %>%
  rename_with(~ str_sub(.x, 2, -1), .cols = `X104974-001-001_S1_L001`:`X104974-001-016_S1_L001`)

# DESeqDataSetFromTximport - original counts + offset tximport
# to be used with DESeq2's vst/rlog transform for visualisations
salmon_quant_files <- paste0("./results-refseq/star_salmon/", list.files(path = "./results-refseq/star_salmon/", pattern = "quant.sf", recursive = TRUE))
txi <- tximport(salmon_quant_files, type = "salmon", tx2gene = transcript_info$tx2gene, txOut = FALSE)
gse <- DESeqDataSetFromTximport(txi, colData = samples, design = ~1)

# clean up
rm(txi)

Collect gene annotation

  • Add species information to genes by gathering list of P. knowlesi genes from gff file.
  • Add gene_biotype info for P. knowlesi genes (protein coding and r/t/nc/sn/snoRNA).
  • Add globin and rRNA gene annotation for both species.
# Note: not all Pk genes start with the prefix PKNH, so it is safer to read genes from GFF instead
gff <- read_delim(here("data/ref-refseq-refseq/GCF_000006355.2_GCA_000006355.2_genomic.gff.gz"), comment = "#", col_names = c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute")) %>%
  # only keep genes, since we use aggregated gene counts
  filter(feature == "gene") %>%
  mutate(id = row_number()) %>%
  separate_rows(attribute, sep = ";") %>%
  separate(attribute, c("attribute", "attribute_value"), sep = "=") %>%
  pivot_wider(names_from = attribute, values_from = attribute_value) %>%
  select(c(gene_biotype, ID)) %>%
  rename(gene_id = ID) %>%
  mutate(species = "pk")

# check to make sure there are no genes in gff that do not appear in transcript info file
stopifnot(all(gff$gene_id %in% transcript_info$gene$gene_id))

# create gene annotation df (based on one of the count matrices, not tx2gene
# because the latter contains more features, namely pseudogenes)
gene_annotation <- gene_counts_txi %>%
  select(c(gene_id, gene_name)) %>%
  left_join(gff, by = "gene_id") %>%
  mutate(species = case_when(is.na(species) ~ "human", .default = species))

# check to make sure all genes were assigned a species
stopifnot(!any(is.na(gene_annotation$species)))

# naive check to make sure "most" Pk genes are assigned the correct species
stopifnot((gene_annotation %>% filter(str_detect(gene_id, "PKNH")) %>% select(species) %>% unique()) == "pk")

Annotate rRNA and globin genes

rRNA

The RefSeq gff files contains info on rRNA in the gene_biotype attribute of gene entries (including mitochondrial rRNA) for both species (i.e., we collected featured labelled as gene in the 3rd column of the gff, not rRNA directly, since counts were aggregated on the gene level).

NC_011902.2 RefSeq  gene    825471  825594  .   +   .   ID=gene-PKNH_0117700;Dbxref=GeneID:7318405;Name=PKNH_0117700;end_range=825594,.;gbkey=Gene;gene_biotype=rRNA;locus_tag=PKNH_0117700;old_locus_tag=PKH_011712;partial=true;start_range=.,825471

NC_011902.2 RefSeq  rRNA    825471  825594  .   +   .   ID=rna-XR_002461722.1;Parent=gene-PKNH_0117700;Dbxref=GeneID:7318405,GenBank:XR_002461722.1;Name=XR_002461722.1;end_range=825594,.;gbkey=rRNA;locus_tag=PKNH_0117700;orig_transcript_id=gnl|WGS:CADCXE|mrna.PKNH_0117700-RA;partial=true;product=5.8S ribosomal RNA;start_range=.,825471;transcript_id=XR_002461722.1

NC_011902.2 RefSeq  exon    825471  825594  .   +   .   ID=exon-XR_002461722.1-1;Parent=rna-XR_002461722.1;Dbxref=GeneID:7318405,GenBank:XR_002461722.1;end_range=825594,.;gbkey=rRNA;locus_tag=PKNH_0117700;orig_transcript_id=gnl|WGS:CADCXE|mrna.PKNH_0117700-RA;partial=true;product=5.8S ribosomal RNA;start_range=.,825471;transcript_id=XR_002461722.1

The bash script in scripts/2_expression_analysis/0_gene_annotations-refseq.sh was used to retrieve the rRNA genes from both annotation files.

# zcat "${ref_human}" | awk '$3 ~ /gene/' | cut -f9 | grep -E "gene_biotype=rRNA" | grep -v "rRNA_pseudo" | grep -o "ID=[^;]*" | sed 's/ID=/human_rRNA;/' >> "${output_file}"
# zcat "${ref_pk}" | cut -f9 | grep -E "gene_biotype=rRNA" | grep -o "ID=[^;]*" | sed 's/ID=/pk_rRNA;/' >> "${output_file}"
# read in gene type table
gene_types_df <- read_csv2(here("data/ref-refseq-refseq/gene_types.csv")) %>% left_join(transcript_info$gene, by = "gene_id")

Note that the GENCODE and RefSeq annotation file contain different subsets of annotated rRNA genes! See semi-comprehensive list of human rRNA genes here: https://www.genenames.org/data/genegroup/#!/group/848. Some are not included in any reference assembly, but others appear to be exclusive to either GENCODE or RefSeq. Some examples include RNA18S and 28S being present in RefSeq only. Moreover, for GENCODE, the majority of rRNA counts appear to be zero across the board.

See tables below for a full breakdown:

(click to expand)
knitr::kable(
  gene_types_df %>%
    filter((gene_type == "human_rRNA") | (gene_type == "Mt_rRNA")) %>%
    left_join(gene_tpm, by = c("gene_id", "gene_name")) %>%
    filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0),
  caption = "Non-zero counts of human rRNA genes annotated in RefSeq"
)
Non-zero counts of human rRNA genes annotated in RefSeq
gene_type gene_id gene_name 104974-001-001_S1_L001 104974-001-002_S1_L001 104974-001-003_S1_L001 104974-001-004_S1_L001 104974-001-005_S1_L001 104974-001-006_S1_L001 104974-001-007_S1_L001 104974-001-008_S1_L001 104974-001-009_S1_L001 104974-001-010_S1_L001 104974-001-011_S1_L001 104974-001-012_S1_L001 104974-001-013_S1_L001 104974-001-014_S1_L001 104974-001-015_S1_L001 104974-001-016_S1_L001
human_rRNA gene-RNA5S1 RNA5S1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 0.000000 0.119986 0.000000 0.156253 0.000000
human_rRNA gene-RNA5S2 RNA5S2 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 0.000000 0.119986 0.000000 0.156253 0.000000
human_rRNA gene-RNA5S3 RNA5S3 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 0.000000 0.119986 0.000000 0.156253 0.000000
human_rRNA gene-RNA5S4 RNA5S4 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 0.000000 0.119986 0.000000 0.156253 0.000000
human_rRNA gene-RNA5S5 RNA5S5 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 0.000000 0.119986 0.000000 0.156253 0.000000
human_rRNA gene-RNA5S6 RNA5S6 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 0.000000 0.119986 0.000000 0.156253 0.000000
human_rRNA gene-RNA5S7 RNA5S7 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 0.000000 0.119986 0.000000 0.156253 0.000000
human_rRNA gene-RNA5S8 RNA5S8 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 0.000000 0.119986 0.000000 0.156253 0.000000
human_rRNA gene-RNA5S9 RNA5S9 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 1.91644 0.000000 0.00000 0.000000 0.000000 0.000000 1.709899 0.500011 1.224046
human_rRNA gene-RNA5S10 RNA5S10 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 0.000000 0.119986 0.000000 0.156253 0.000000
human_rRNA gene-RNA5S11 RNA5S11 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 0.000000 0.119986 0.000000 0.156253 0.000000
human_rRNA gene-RNA5S12 RNA5S12 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 0.000000 0.119986 0.000000 0.156253 4.896184
human_rRNA gene-RNA5S13 RNA5S13 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 0.000000 0.119986 0.000000 0.156253 0.000000
human_rRNA gene-RNA5S14 RNA5S14 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 0.000000 0.119986 2.564849 0.156253 0.000000
human_rRNA gene-RNA5S15 RNA5S15 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 0.000000 0.119986 0.000000 0.156253 0.000000
human_rRNA gene-RNA5S16 RNA5S16 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 3.342939 0.119986 0.000000 0.156253 0.000000
human_rRNA gene-RNA5S17 RNA5S17 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 0.000000 0.119986 0.000000 0.156253 0.000000
human_rRNA gene-RNA45SN2 RNA45SN2 4.751703 2.105310 4.090869 2.859081 0.000000 7.056017 1532.79602 920.02587 193.091071 187.12187 23.858285 27.716852 11.439136 5.365124 7.821932 6.431999
human_rRNA gene-RNA18SN2 RNA18SN2 62.342454 107.701193 200.195536 0.000000 296.675752 0.000000 0.00000 0.00000 220.510714 374.09125 351.102665 1677.113938 1064.578070 1535.906680 1762.266899 2096.663452
human_rRNA gene-RNA5-8SN2 RNA5-8SN2 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 138.875585
human_rRNA gene-RNA28SN2 RNA28SN2 190.946875 250.720809 201.681762 242.757269 219.348795 0.000000 0.00000 36.77702 1272.445684 1111.53791 1185.237687 1875.808091 1060.493248 1296.851623 1615.471929 1534.355232
human_rRNA gene-RNA45SN3 RNA45SN3 0.997892 0.000000 0.000000 0.000000 0.268391 0.000000 139.91191 91.09562 33.129508 29.48614 4.590309 1.549567 0.924119 0.393336 0.000000 0.000000
human_rRNA gene-RNA18SN3 RNA18SN3 62.012092 105.547238 196.740978 0.000000 295.746390 0.000000 0.00000 0.00000 219.883500 374.07612 347.512255 1671.307115 1078.803228 1542.499676 1753.969147 2086.600386
human_rRNA gene-RNA5-8SN3 RNA5-8SN3 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 58.912857
human_rRNA gene-RNA28SN3 RNA28SN3 16.865757 25.000134 29.234420 22.112708 29.729779 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 23.257654 17.733035 15.199022 20.575958 18.046203
human_rRNA gene-RNA45SN1 RNA45SN1 6.039829 0.000000 0.000000 0.000000 0.000000 6.019315 604.00017 368.80178 76.638722 68.36348 7.017963 14.415548 5.925401 2.351978 1.894936 2.098510
human_rRNA gene-RNA5-8SN1 RNA5-8SN1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 31.234607
human_rRNA gene-RNA28SN1 RNA28SN1 67.143854 82.323209 103.902309 82.308384 119.013023 0.000000 795.14549 1132.53586 1222.286618 1449.88852 1549.155050 0.000000 0.000000 0.000000 0.000000 0.000000
human_rRNA gene-RNA45SN4 RNA45SN4 7.349364 5.157419 5.628581 5.095599 1.426797 10.922209 1756.19524 1065.05597 251.309378 236.85418 32.406743 52.335264 23.963442 9.690368 8.997157 6.814541
human_rRNA gene-RNA18SN4 RNA18SN4 58.828665 111.066179 205.627316 0.000000 293.319650 0.000000 0.00000 0.00000 219.759981 370.27779 348.278460 1666.244320 1079.004751 1542.588846 1756.895843 2073.957543
human_rRNA gene-RNA5-8SN4 RNA5-8SN4 0.000000 0.000000 0.000000 0.000000 32.860697 36.502763 0.00000 0.00000 0.000000 0.00000 0.000000 748.116995 523.601567 636.022692 563.180219 525.143478
human_rRNA gene-RNA28SN4 RNA28SN4 488.534706 638.796031 567.890117 603.969644 619.882570 0.000000 1314.76873 1956.17536 3199.299038 3993.27833 4511.897218 4190.916438 2457.827433 2536.060195 3411.269658 3033.838207
human_rRNA gene-RNA45SN5 RNA45SN5 0.000000 0.194935 0.000000 0.689502 0.187359 0.000000 11.11903 11.74383 8.504285 10.58738 5.416470 0.181703 0.000000 0.000000 0.000000 0.000000
human_rRNA gene-RNA28SN5 RNA28SN5 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 23.041655 0.000000 0.000000 0.000000 0.000000 0.000000
human_rRNA gene-RNA5S17-2 RNA5S17 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 7.800192 14.398334 14.534145 5.500117 11.832444
human_rRNA gene-RNA45SN3-2 RNA45SN3 0.000000 3.504348 0.579441 0.561736 0.000000 1.190227 171.05619 113.23896 34.307079 34.74936 9.053317 0.392968 0.000000 0.501621 0.943800 0.933711
human_rRNA gene-RNA18SN3-2 RNA18SN3 54.242554 114.466476 213.094452 760.232923 294.189391 0.000000 0.00000 0.00000 220.421934 367.76918 349.563430 1649.049824 1081.121546 1543.470708 1750.138192 2066.106213
human_rRNA gene-RNA28SN3-2 RNA28SN3 15.566932 29.297394 30.742063 25.074316 28.841534 0.000000 0.00000 0.00000 0.000000 0.00000 0.000000 23.646839 17.483903 15.206186 20.562627 21.406914
human_rRNA gene-RNA45SN1-2 RNA45SN1 5.973266 8.866557 3.016118 4.251982 0.589623 0.000000 731.71393 435.74070 88.820605 77.82077 8.493107 14.506503 5.872081 2.395843 5.115513 2.191921
human_rRNA gene-RNA28SN1-2 RNA28SN1 69.232038 108.436101 102.541791 81.803956 122.395534 0.000000 994.92817 1309.04810 1320.097300 1613.90704 1654.843682 0.000000 0.000000 0.000000 0.000000 0.000000
human_rRNA gene-RNR1 RNR1 14834.315751 16214.239057 6908.575576 3168.095844 8443.443125 10.732938 33.19660 30.11688 13.739731 29.93136 4.488337 3549.195879 3265.529789 462.500954 260.213996 337.772158
human_rRNA gene-RNR2 RNR2 27104.637436 29430.692094 19293.910206 9832.230123 26033.024421 19.394133 65.09365 67.31826 30.271294 30.05422 21.338831 5784.927065 5240.821956 1166.936234 768.801099 990.511073
# check if GENCODE results are also present in directory
if (file.exists(here("data/ref/gene_types.csv"))) {
  transcript_info_gencode <- read.csv(here("results/star_salmon/tx2gene.tsv"),
    sep = "\t", header = FALSE,
    col.names = c("tx", "gene_id", "gene_name")
  )
  transcript_info_gencode <- list(
    transcript = transcript_info_gencode,
    gene = unique(transcript_info_gencode[, 2:3]),
    tx2gene = transcript_info_gencode[, 1:2]
  )
  gene_types_gencode_df <- read_csv2(here("data/ref/gene_types.csv")) %>%
    left_join(transcript_info_gencode$gene, by = "gene_id")

  gene_tpm_gencode <- read_delim(here("results/star_salmon/salmon.merged.gene_tpm.tsv"))

  knitr::kable(
    gene_types_gencode_df %>%
      filter((gene_type == "human_rRNA") | (gene_type == "Mt_rRNA")) %>%
      left_join(gene_tpm_gencode, by = c("gene_id", "gene_name")) %>%
      filter(rowSums(across(starts_with("X"))) != 0),
    caption = "Non-zero counts of human rRNA genes annotated in GENCODE"
  )
}
Non-zero counts of human rRNA genes annotated in GENCODE
gene_type gene_id gene_name X104974.001.001_S1_L001 X104974.001.002_S1_L001 X104974.001.003_S1_L001 X104974.001.004_S1_L001 X104974.001.005_S1_L001 X104974.001.006_S1_L001 X104974.001.007_S1_L001 X104974.001.008_S1_L001 X104974.001.009_S1_L001 X104974.001.010_S1_L001 X104974.001.011_S1_L001 X104974.001.012_S1_L001 X104974.001.013_S1_L001 X104974.001.014_S1_L001 X104974.001.015_S1_L001 X104974.001.016_S1_L001
human_rRNA ENSG00000199352.1 RNA5S1 0.00000 0.0000 0.00000 0 0.00000 0.00000 0.00000 0.000000 0.0000 0.000 0.00000 0.00000 63.88345 0.000000 0.000000 0.000000
human_rRNA ENSG00000201588.1 RNA5S2 0.00000 0.0000 0.00000 0 0.00000 0.00000 0.00000 0.000000 0.0000 0.000 0.00000 37.62973 0.00000 0.000000 0.000000 0.000000
human_rRNA ENSG00000200624.1 RNA5S6 0.00000 0.0000 0.00000 0 0.00000 0.00000 0.00000 0.000000 0.0000 0.000 0.00000 0.00000 0.00000 49.459879 0.000000 0.000000
human_rRNA ENSG00000201321.1 RNA5S9 0.00000 0.0000 0.00000 0 0.00000 0.00000 0.00000 4.530894 0.0000 0.000 0.00000 0.00000 0.00000 1.764045 0.564701 1.347898
human_rRNA ENSG00000202526.1 RNA5S13 0.00000 0.0000 0.00000 0 0.00000 0.00000 0.00000 0.000000 0.0000 0.000 0.00000 0.00000 0.00000 0.000000 0.000000 37.116281
human_rRNA ENSG00000201355.1 RNA5S14 0.00000 0.0000 0.00000 0 0.00000 0.00000 0.00000 0.000000 0.0000 0.000 0.00000 0.00000 0.00000 0.000000 48.057305 39.899804
human_rRNA ENSG00000199839.1 RNA5SP150 0.00000 0.0000 0.00000 0 0.00000 0.00000 0.00000 0.000000 0.0000 0.000 0.00000 0.00000 0.00000 0.000000 0.000000 0.396039
human_rRNA ENSG00000283274.1 5_8S_rRNA 0.00000 0.0000 0.00000 0 0.00000 0.00000 0.00000 0.000000 0.0000 0.000 0.00000 0.00000 0.00000 0.000000 0.000000 0.214992
human_rRNA ENSG00000283291.1 5_8S_rRNA 0.00000 0.0000 0.00000 0 0.00000 0.00000 0.00000 0.000000 0.0000 0.000 0.00000 0.00000 0.00000 0.000000 0.000000 1.289955
human_rRNA ENSG00000278233.1 RNA5-8SN2 0.00000 16.6997 0.00000 0 0.00000 0.00000 76.32146 0.000000 0.0000 0.000 82.78823 119.82333 180.19770 71.675414 54.502627 91.872277
human_rRNA ENSG00000277739.1 5_8S_rRNA 0.00000 0.0000 0.00000 0 0.00000 0.00000 0.00000 80.526347 0.0000 0.000 0.00000 101.55505 90.56981 69.776516 58.522634 91.386372
human_rRNA ENSG00000275215.1 RNA5-8SN3 11.15067 0.0000 18.24354 0 0.00000 0.00000 0.00000 0.000000 94.8767 0.000 0.00000 109.06286 88.00438 73.524728 62.021795 91.480219
human_rRNA ENSG00000278189.1 RNA5-8SN1 0.00000 0.0000 0.00000 0 38.72457 0.00000 0.00000 0.000000 0.0000 0.000 0.00000 104.88716 43.51271 72.784121 73.098666 92.466613
human_rRNA ENSG00000276871.1 5_8S_rRNA 0.00000 0.0000 0.00000 0 0.00000 0.00000 0.00000 0.000000 0.0000 0.000 0.00000 0.00000 0.00000 0.000000 0.000000 0.214992
human_rRNA ENSG00000274917.1 RNA5-8SN5 0.00000 0.0000 0.00000 0 0.00000 0.00000 0.00000 0.000000 0.0000 0.000 0.00000 105.47408 28.43676 78.916641 77.600476 91.012288
human_rRNA ENSG00000273730.1 5_8S_rRNA 0.00000 0.0000 0.00000 0 0.00000 0.00000 0.00000 0.000000 0.0000 0.000 0.00000 94.87708 34.08167 80.865489 82.939565 91.298725
human_rRNA ENSG00000278294.1 5_8S_rRNA 0.00000 0.0000 0.00000 0 0.00000 0.00000 0.00000 0.000000 0.0000 0.000 0.00000 0.00000 0.00000 0.000000 0.000000 1.289955
human_rRNA ENSG00000276700.1 RNA5-8SN4 0.00000 0.0000 0.00000 0 0.00000 0.00000 0.00000 0.000000 0.0000 102.071 0.00000 86.60765 36.65530 80.646396 79.449934 90.904625
human_rRNA ENSG00000275757.1 5_8S_rRNA 0.00000 0.0000 0.00000 0 0.00000 66.87828 0.00000 0.000000 0.0000 0.000 0.00000 84.14490 44.42230 81.411437 80.039742 91.985737
gene_types_df %>%
  left_join(gene_tpm, by = c("gene_id", "gene_name")) %>%
  filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0) %>%
  pivot_longer(
    cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`,
    names_to = "sample",
    values_to = "counts"
  ) %>%
  left_join(samples, by = "sample") %>%
  group_by(gene_id) %>%
  mutate(total = sum(counts)) %>%
  ungroup() %>%
  arrange(gene_type, desc(total)) %>%
  mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
  ggplot(aes(x = gene_name, y = counts, fill = sample_short)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(option = "viridis", discrete = T) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  labs(fill = "Sample", title = "RefSeq annotated rRNA - TPM") +
  ylab(label = "TPM") +
  xlab(label = "Gene name") +
  facet_wrap(~kit_name) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale()))

gene_types_gencode_df %>%
  left_join(gene_tpm_gencode, by = c("gene_id", "gene_name")) %>%
  filter(rowSums(across(starts_with("X"))) != 0) %>%
  pivot_longer(
    cols = `X104974.001.001_S1_L001`:`X104974.001.016_S1_L001`,
    names_to = "sample",
    values_to = "counts"
  ) %>%
  mutate(sample = str_replace_all(sample, "\\.", replacement = "-")) %>%
  mutate(sample = str_sub(sample, 2, -1)) %>%
  left_join(samples, by = "sample") %>%
  group_by(gene_id) %>%
  mutate(total = sum(counts)) %>%
  ungroup() %>%
  arrange(gene_type, desc(total)) %>%
  mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
  ggplot(aes(x = gene_name, y = counts, fill = sample_short)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(option = "viridis", discrete = T) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  labs(fill = "Sample", title = "GENCODE annotated rRNA - TPM") +
  ylab(label = "TPM") +
  xlab(label = "Gene name") +
  facet_wrap(~kit_name) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale()))

Globins

For human globins, the following genes were included: “HBB”,“HBD”,“HBBP1”,“HBG1”,“HBG2”,“HBE1”,“HBZ”,“HBM”,“HBA2”,“HBA1”,“HBQ1”.

globin_names <- c("HBB", "HBD", "HBBP1", "HBG1", "HBG2", "HBE1", "HBZ", "HBM", "HBA2", "HBA1", "HBQ1")
globin_df <- tibble(gene_name = globin_names, gene_type = "human_globin") %>% left_join(transcript_info$gene, by = "gene_name")

gene_types_df <- gene_types_df %>% bind_rows(globin_df)

# export full gene type file
write_csv2(gene_types_df, here("data/ref-refseq-refseq/gene_types_R.csv"))

# some gene id share the same gene name
# gene_types_df %>%
#     group_by(gene_name) %>%
#     filter(n()>1)

Merge globin and rRNA annotations with gene annotation dataframe:

gene_annotation <- gene_annotation %>%
  left_join(gene_types_df, by = c("gene_id", "gene_name")) %>%
  mutate("gene_type" = coalesce(gene_type, species))

Add annotations to counts

# merge gene_type annotation and fill in missing types using species name
annotate_gene_type <- function(data, gene_annotation) {
  data %>%
    left_join(gene_annotation, by = c("gene_id", "gene_name"))
}

gene_counts_txi <- annotate_gene_type(gene_counts_txi, gene_annotation)
gene_tpm <- annotate_gene_type(gene_tpm, gene_annotation)

# for SummarizedExperiment DESeq2 object, add annotations to rowData
rowData(gse) <- gene_annotation

# ensure genes are in the same order in annotation data and count files
# stopifnot(all(rownames(txi$abundance) == gene_annotation$gene_id))
stopifnot(all(rownames(gse) == gene_annotation$gene_id))

Subset DESeq object on Pk genes

gse_pk <- gse[rowData(gse)$species == "pk", ]
# or
# pk_genes <- gene_annotation %>% filter(species == "pk") %>% select(gene_id) %>% pull()
# gse_pk <- gse[pk_genes]

Filtering out low count genes

The raw counts can optionally be pre-filtered to remove rows that carry little to no information on gene expression levels. The RNA-seq tutorial by Love et. al 2019 and the DESeq2 vignette recommended a minimum count of 10 reads for a minimal number of samples (e.g., the size of the smallest group). Since we do not have replicates for the different treatment groups, we instead opt for a minimum of 10 reads for at least a single sample, as a conservative filtering approach (e.g., only genes for which none of the samples have a higher read count than 10 are removed).

# create filter for all genes on the level of the DESeq/tximport gene counts
filter_all_genes <- rowSums(counts(gse) >= 10) >= 1
# should be the same as filtering on tximport raw gene counts from nf-core pipeline
stopifnot(all(filter_all_genes == (rowSums(gene_counts_txi[, 3:18] >= 10) >= 1)))

# create filtered versions of deseq2 objects: all genes, pk genes, pk protein-coding
gse_filtered <- gse[filter_all_genes, ]
gse_pk_filtered <- gse_filtered[rowData(gse_filtered)$species == "pk", ]
gse_pk_filtered_coding <- gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding", ]

# create filtered versions of other count matrices
gene_tpm_filtered <- gene_tpm[filter_all_genes, ]
gene_counts_txi_filtered <- gene_counts_txi[filter_all_genes, ]

# check if filter was applied correctly, i.e. order of genes is the same in gse and tpm matrix
stopifnot(all.equal(gene_tpm_filtered, gene_tpm[(rowSums(gene_counts_txi[, 3:18] >= 10) >= 1), ]))
stopifnot(gene_tpm$gene_id == rownames(gse))
stopifnot(gene_tpm_filtered$gene_id == rownames(gse_filtered))
stopifnot(all.equal(gene_counts_txi_filtered, gene_counts_txi[(rowSums(gene_counts_txi[, 3:18] >= 10) >= 1), ]))
stopifnot(gene_counts_txi$gene_id == rownames(gse))
stopifnot(gene_counts_txi$gene_id == gene_tpm$gene_id)
stopifnot(gene_counts_txi_filtered$gene_id == rownames(gse_filtered))

The number of genes were filtered down from 55743 to 27955 and from 5452 to 5147, for all genes and P. knowlesi genes respectively. So in conclusion, the effect is relatively minor for P. knowlesi, but a substantial number of human genes were removed.

It is difficult to determine an appropriate threshold for filtering on the TPM level, so instead we filter the same genes as on the raw count level (>=10 for at least 1 sample).

dim(gene_tpm)
## [1] 55743    21
mean.tpm <- apply(gene_tpm[, 3:18], 1, mean)
hist(log10(mean.tpm), main = "TPM before filtering")

# hist(log10(mean.tpm+1e-5))

dim(gene_tpm_filtered)
## [1] 27955    21
mean.tpm <- apply(gene_tpm_filtered[, 3:18], 1, mean)
hist(log10(mean.tpm), main = "TPM after filtering")

Transform dataframes to long format

# cast to long format
to_long <- function(data) {
  data %>%
    pivot_longer(cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`, names_to = "sample", values_to = "counts")
}

gene_counts_txi_long <- to_long(gene_counts_txi)
gene_counts_txi_filtered_long <- to_long(gene_counts_txi_filtered)

gene_tpm_long <- to_long(gene_tpm)
gene_tpm_filtered_long <- to_long(gene_tpm_filtered)

Transform counts for visualisation

For visualisation purposes (e.g., PCA or heatmaps), gene counts should be transformed so that they all have the same range of variances, i.e. homoskedastic, instead of the variance depending on the mean. DESeq2 offers two transform options to stabilize the variance, vst and rlog. According to Love et al., 2019, rlog, while slower, works better for small datasets (n < 30) and when there are large differences in sequencing depth across samples.

We use blind=TRUE to obtain a fully unsupervised transformation (independent of the design, although we are using an intercept-only model anyway), generally intended to observe batch effects. However, since we can have large differences in counts between samples due to our experimental design (e.g., zero counts for rRNA or globins in some samples), the transformation might not work as well as intended and shrink values too much (DESeq2 vignette).

Also note that attempting to transform unfiltered counts can lead to strange outcomes.

Lastly, the transformation should ideally be performed once on the entire set of gene counts (after filtering out low counts), and only then subset on different gene groups of interest (e.g., species, protein-coding genes only, etc.), rather than re-applying the transformation to a subset of counts. However, in actual experiments focused on P. knowlesi, researchers will also throw out human gene counts (e.g., by not even mapping to the human reference or simply discarding those reads). In that sense, evaluating the results by transforming only the genes of interest can be considered the better approach.

# # variance stabilizing transformation without filtering
# vsd <- vst(gse, blind = TRUE)
# p <- meanSdPlot(assay(vsd))
# p$gg + labs(title = "VST on unfiltered counts")

# extract transformed gene counts
# gene_counts_vst <- data.frame(assays(vsd)[[1]])
# colnames(gene_counts_vst) <- colData(gse)$sample
# gene_counts_vst <- gene_counts_vst %>%
#   mutate(gene_id = rownames(gene_counts_vst)) %>%
#   left_join(transcript_info$gene, by = "gene_id")

# variance stabilizing transform on filtered object
vsd_filtered <- vst(gse_filtered, blind = TRUE)
p <- meanSdPlot(assay(vsd_filtered))

p$gg + labs(title = "VST on filtered counts")

# compare with rlog transform on filtered object
rld_filtered <- rlog(gse_filtered, blind = TRUE)
p <- meanSdPlot(assay(rld_filtered))

p$gg + labs(title = "Rlog on filtered counts")

# transform after subsetting on species/protein-coding genes
# vsd_pk <- vst(gse_pk, blind = TRUE)
# p <- meanSdPlot(assay(vsd_pk))
# p$gg + labs(title = "VST on unfiltered counts after subsetting Pk genes")

vsd_pk_filtered <- vst(gse_pk_filtered, blind = TRUE)
p <- meanSdPlot(assay(vsd_pk_filtered))

p$gg + labs(title = "VST on filtered counts after subsetting Pk genes")

rld_pk_filtered <- rlog(gse_pk_filtered, blind = TRUE)
p <- meanSdPlot(assay(rld_pk_filtered))

p$gg + labs(title = "Rlog on filtered counts after subsetting Pk genes")

vsd_pk_filtered_coding <- vst(gse_pk_filtered_coding, blind = TRUE)
p <- meanSdPlot(assay(vsd_pk_filtered))

p$gg + labs(title = "VST on filtered counts after subsetting Pk protein-coding genes")

rld_pk_filtered_coding <- rlog(gse_pk_filtered_coding, blind = TRUE)
p <- meanSdPlot(assay(rld_pk_filtered))

p$gg + labs(title = "Rlog on filtered counts after subsetting Pk protein-coding genes")

# extract transformed gene counts
# gene_counts_vst_pk <- data.frame(assays(vsd_pk)[[1]])
# colnames(gene_counts_vst_pk) <- colData(vsd_pk)$sample
# gene_counts_vst_pk <- gene_counts_vst_pk %>%
#   mutate(gene_id = rownames(gene_counts_vst_pk)) %>%
#   left_join(transcript_info$gene, by = "gene_id")

Export RDS objects

Save all output to rds object for easier loading.

# dir.create(here("results-refseq/r_objects/"), showWarnings = FALSE)
# 
# purrr::iwalk(
#   tibble:::lst(
#     fastq_screen,
#     fastq_screen_mapping_stats_df,
#     gene_annotation,
#     gene_counts_txi,
#     gene_counts_txi_long,
#     gene_counts_txi_filtered,
#     gene_counts_txi_filtered_long,
#     gene_tpm,
#     gene_tpm_long,
#     gene_tpm_filtered,
#     gene_tpm_filtered_long,
#     gse,
#     gse_filtered,
#     gse_pk,
#     gse_pk_filtered,
#     mapping_stats_df,
#     pcr_dups,
#     samples,
#     star_stats,
#     # trancsript_info,
#     rld_filtered,
#     rld_pk_filtered,
#     rld_pk_filtered_coding,
#     vsd_filtered,
#     vsd_pk_filtered,
#     vsd_pk_filtered_coding,
#     to_long,
#   ),
#   ~ saveRDS(.x, file = here(paste0("results-refseq/r_objects/", .y, ".RDS")))
# )

Data clean up

# clean up
rm(gene_types_df)
# rm(transcript_info)
rm(globin_df)
rm(gene_tpm_gencode)
rm(gene_types_gencode_df)
rm(transcript_info_gencode)
rm(gff)
rm(p)
rm(filter_all_genes)

Analyses and figures

dir.create(here("results-refseq/figures/"), showWarnings = FALSE)

Summary of annotated genes

Number of genes per species:

gene_counts_txi %>%
  group_by(species) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(total = sum(n), rel.freq = n / total)

Number of genes per species after filtering low counts (n < 10 for all samples):

gene_counts_txi_filtered %>%
  group_by(species) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(total = sum(n), rel.freq = n / total)

Number of gene types per species:

gene_counts_txi %>%
  group_by(species, gene_type) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(total = sum(n), rel.freq = round(n / total, 5))
## `summarise()` has grouped output by 'species'. You can override using the
## `.groups` argument.

Number of gene types per species after filtering out low counts (n < 10 for all samples):

gene_counts_txi_filtered %>%
  group_by(species, gene_type) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(total = sum(n), rel.freq = round(n / total, 5))
## `summarise()` has grouped output by 'species'. You can override using the
## `.groups` argument.
n_genes <- dim(gene_counts_txi)[1]
# stopifnot(dim(gene_counts_txi)[1] == (gene_counts_salmon %>% group_by(sample) %>% summarise(n = n()) %>% select(n) %>% unique()))
n_genes_filtered <- dim(gse_filtered)[1]

n_human_genes <- gene_counts_txi %>%
  filter(species == "human") %>%
  count() %>%
  pull()
n_human_genes_filtered <- gene_counts_txi_filtered %>%
  filter(species == "human") %>%
  count() %>%
  pull()

n_pk_genes <- dim(gse_pk)[1]
stopifnot(dim(gse_pk)[1] == (gene_counts_txi %>% filter(species == "pk") %>% count()))
n_pk_genes_filtered <- dim(gse_pk_filtered)[1]
stopifnot(dim(gse_pk_filtered)[1] == (gene_counts_txi_filtered %>% filter(species == "pk") %>% count()))

Comparison of total RNA levels

The direct comparison of RPKM and TPM across samples is meaningful only when there are equal total RNAs between compared samples and the distribution of RNA populations are close to each other.

Make sure both samples use the same RNA isolation approach [poly(A)+ selection versus ribosomal RNA depletion]. If not, they should not be compared.

Check the fraction of the ribosomal, mitochondrial and globin RNAs, and the top highly expressed transcripts and see whether such RNAs constitute a very large part of the sequenced reads in a sample, and thus decrease the sequencing “real estate” available for the remaining genes in that sample. If the calculated fractions in two samples differ significantly, do not compare RPKM or TPM values directly.

TPM should never be used for quantitative comparisons across samples when the total RNA contents and its distributions are very different. However, under appropriate circumstances, TPM can still be useful for qualitative comparison such as PCA and clustering analysis.

(Zhao et al., 2020 - http://dx.doi.org/10.1261/rna.074922.120)

Looking at our samples, especially the ones prepared using the Illumina kit exhibit much higher total RNA content. Based on the output of Picard MarkDuplicates, the differences can at least partially be attributed to the number of PCR duplicates. The proportion of duplicates is comparable across kits, but they do appear to be consistently higher for some depletion methods (no filter > centpip > plasmo > pmacs > cellulose).

gene_counts_txi_filtered %>%
  summarise(across(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`, sum)) %>%
  to_long() %>%
  left_join(samples, by = "sample") %>%
  ggplot(aes(x = sample_short, y = counts, fill = kit_name)) +
  geom_bar(stat = "identity") +
  scale_fill_viridis(option = "viridis", discrete = T) +
  # labels = c("mRNA Illumina", "mRNA Qiagen globin depletion", "mRNA Qiagen globin/rRNA depletion", "tRNA Qiagen globin/rRNA depletion")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Number of reads", fill = "Kit")

pcr_dups %>%
  left_join(samples, by = join_by(Sample == sample)) %>%
  mutate(READS_IN_DUPLICATE_PAIRS = 2 * READ_PAIR_DUPLICATES) %>%
  mutate(READS_IN_UNIQUE_PAIRS = 2 * (READ_PAIRS_EXAMINED - READ_PAIR_DUPLICATES)) %>%
  mutate(READS_IN_UNIQUE_UNPAIRED = UNPAIRED_READS_EXAMINED - UNPAIRED_READ_DUPLICATES) %>%
  mutate(READS_IN_DUPLICATE_PAIRS_OPTICAL = 2 * READ_PAIR_OPTICAL_DUPLICATES) %>%
  mutate(READS_IN_DUPLICATE_PAIRS_NONOPTICAL = READS_IN_DUPLICATE_PAIRS - READS_IN_DUPLICATE_PAIRS_OPTICAL) %>%
  mutate(READS_IN_DUPLICATE_UNPAIRED = UNPAIRED_READ_DUPLICATES) %>%
  mutate(READS_UNMAPPED = UNMAPPED_READS) %>%
  select(c(sample_name, condition, kit, READS_IN_UNIQUE_PAIRS, READS_IN_UNIQUE_UNPAIRED, READS_IN_DUPLICATE_PAIRS_OPTICAL, READS_IN_DUPLICATE_PAIRS_NONOPTICAL, READS_IN_DUPLICATE_UNPAIRED)) %>%
  pivot_longer(cols = 4:8, names_to = "type", values_to = "counts") %>%
  ggplot(aes(x = sample_name, y = counts, fill = type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(option = "viridis", discrete = T) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Number of reads", fill = "Picard deduplication stats")

Alignment statistics

The output reported by samtool flagstats and STAR’s Log.final.out files cannot be compared directly: the latter reports counts of read-pairs as opposed to individual reads, and the former does not contain unmapped reads (these are not added to the .bam outputs by default) while also counting multi-mapped reads. We opted to report the STAR statistics, but the flagstat output is included for completion’s sake.

STAR alignment statistics

star_stats
star_stats %>%
  pivot_longer(cols = 2:6, names_to = "type", values_to = "counts") %>%
  left_join(samples, by = join_by(Sample == sample)) %>%
  ggplot(aes(x = sample_name, y = counts, fill = type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(option = "viridis", discrete = T) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Number of read pairs", fill = "Mapping stats")

samtools flagstat

mapping_stats_df
mapping_stats_df %>%
  left_join(samples, by = "sample") %>%
  pivot_longer(cols = primary_mapped_pk:primary_mapped_human, names_to = "type", values_to = "counts") %>%
  ggplot(aes(x = sample_name, y = counts, fill = type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(option = "viridis", discrete = T) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Number of primary reads", fill = "Samtools flagstat")

Top expressed reads in TPM

Sample 104974-001-001_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-B2M B2M NA human human 104974-001-001_S1_L001 27679.223
gene-RNR2 RNR2 NA human human_rRNA 104974-001-001_S1_L001 27104.637
gene-S100A8 S100A8 NA human human 104974-001-001_S1_L001 22578.637
gene-S100A9 S100A9 NA human human 104974-001-001_S1_L001 20902.902
gene-ND3 ND3 NA human human 104974-001-001_S1_L001 19021.278
gene-RPS27 RPS27 NA human human 104974-001-001_S1_L001 17762.379
gene-RNR1 RNR1 NA human human_rRNA 104974-001-001_S1_L001 14834.316
gene-COX3 COX3 NA human human 104974-001-001_S1_L001 14662.673
gene-ATP6 ATP6 NA human human 104974-001-001_S1_L001 11625.447
gene-TMSB4X TMSB4X NA human human 104974-001-001_S1_L001 11335.273
gene-FTL FTL NA human human 104974-001-001_S1_L001 10562.604
gene-COX2 COX2 NA human human 104974-001-001_S1_L001 10466.481
gene-RPL41 RPL41 NA human human 104974-001-001_S1_L001 9533.806
gene-ND2 ND2 NA human human 104974-001-001_S1_L001 9363.211
gene-RPLP2 RPLP2 NA human human 104974-001-001_S1_L001 9110.287
gene-RPS20 RPS20 NA human human 104974-001-001_S1_L001 8465.206
gene-RPL21 RPL21 NA human human 104974-001-001_S1_L001 8463.110
gene-RPS12 RPS12 NA human human 104974-001-001_S1_L001 7906.045
gene-COX1 COX1 NA human human 104974-001-001_S1_L001 7773.208
gene-ND1 ND1 NA human human 104974-001-001_S1_L001 7213.298
gene-RPS21 RPS21 NA human human 104974-001-001_S1_L001 7198.383
gene-EEF1A1 EEF1A1 NA human human 104974-001-001_S1_L001 6880.381
gene-RPS24 RPS24 NA human human 104974-001-001_S1_L001 6067.828
gene-RPL30 RPL30 NA human human 104974-001-001_S1_L001 5941.840
gene-B2M-2 B2M NA human human 104974-001-001_S1_L001 5852.971

Sample 104974-001-002_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-B2M B2M NA human human 104974-001-002_S1_L001 34855.539
gene-RNR2 RNR2 NA human human_rRNA 104974-001-002_S1_L001 29430.692
gene-S100A9 S100A9 NA human human 104974-001-002_S1_L001 24583.661
gene-S100A8 S100A8 NA human human 104974-001-002_S1_L001 24327.359
gene-ND3 ND3 NA human human 104974-001-002_S1_L001 16508.778
gene-RNR1 RNR1 NA human human_rRNA 104974-001-002_S1_L001 16214.239
gene-COX3 COX3 NA human human 104974-001-002_S1_L001 15349.098
gene-RPS27 RPS27 NA human human 104974-001-002_S1_L001 13599.609
gene-FTL FTL NA human human 104974-001-002_S1_L001 13013.354
gene-TMSB4X TMSB4X NA human human 104974-001-002_S1_L001 11699.378
gene-ATP6 ATP6 NA human human 104974-001-002_S1_L001 11343.645
gene-COX2 COX2 NA human human 104974-001-002_S1_L001 10378.688
gene-ND2 ND2 NA human human 104974-001-002_S1_L001 9614.317
gene-RPL21 RPL21 NA human human 104974-001-002_S1_L001 8803.669
gene-RPL41 RPL41 NA human human 104974-001-002_S1_L001 8432.933
gene-COX1 COX1 NA human human 104974-001-002_S1_L001 8236.118
gene-RPS12 RPS12 NA human human 104974-001-002_S1_L001 8169.655
gene-RPLP2 RPLP2 NA human human 104974-001-002_S1_L001 7483.747
gene-ND1 ND1 NA human human 104974-001-002_S1_L001 7330.897
gene-EEF1A1 EEF1A1 NA human human 104974-001-002_S1_L001 7075.555
gene-B2M-2 B2M NA human human 104974-001-002_S1_L001 7055.202
gene-HBA2 HBA2 NA human human_globin 104974-001-002_S1_L001 6973.821
gene-RPS20 RPS20 NA human human 104974-001-002_S1_L001 6841.469
gene-SRGN SRGN NA human human 104974-001-002_S1_L001 6536.149
gene-ND4 ND4 NA human human 104974-001-002_S1_L001 5657.471

Sample 104974-001-003_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-HBA2 HBA2 NA human human_globin 104974-001-003_S1_L001 46533.376
gene-FTL FTL NA human human 104974-001-003_S1_L001 29467.269
gene-HBB HBB NA human human_globin 104974-001-003_S1_L001 27466.991
gene-RPL21 RPL21 NA human human 104974-001-003_S1_L001 23643.612
gene-UBA52 UBA52 NA human human 104974-001-003_S1_L001 21219.997
gene-RPS12 RPS12 NA human human 104974-001-003_S1_L001 20864.756
gene-RNR2 RNR2 NA human human_rRNA 104974-001-003_S1_L001 19293.910
gene-UBB UBB NA human human 104974-001-003_S1_L001 17736.970
gene-HBA1 HBA1 NA human human_globin 104974-001-003_S1_L001 16262.189
gene-B2M B2M NA human human 104974-001-003_S1_L001 14549.406
gene-GYPC GYPC NA human human 104974-001-003_S1_L001 12990.299
gene-RPL41 RPL41 NA human human 104974-001-003_S1_L001 11766.569
gene-OAZ1 OAZ1 NA human human 104974-001-003_S1_L001 9783.310
gene-FKBP8 FKBP8 NA human human 104974-001-003_S1_L001 8678.414
gene-S100A8 S100A8 NA human human 104974-001-003_S1_L001 7978.580
gene-S100A9 S100A9 NA human human 104974-001-003_S1_L001 7889.953
gene-SLC25A39 SLC25A39 NA human human 104974-001-003_S1_L001 7762.571
gene-RPS27 RPS27 NA human human 104974-001-003_S1_L001 7712.087
gene-RPLP2 RPLP2 NA human human 104974-001-003_S1_L001 7343.200
gene-RNR1 RNR1 NA human human_rRNA 104974-001-003_S1_L001 6908.576
gene-RPL30 RPL30 NA human human 104974-001-003_S1_L001 6190.096
gene-COX3 COX3 NA human human 104974-001-003_S1_L001 6067.999
gene-RPS11 RPS11 NA human human 104974-001-003_S1_L001 5505.873
gene-ND3 ND3 NA human human 104974-001-003_S1_L001 5258.114
gene-PKNH_0418600 gene-PKNH_0418600 protein_coding pk pk 104974-001-003_S1_L001 5022.868

Sample 104974-001-004_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-HBA2 HBA2 NA human human_globin 104974-001-004_S1_L001 52755.665
gene-HBB HBB NA human human_globin 104974-001-004_S1_L001 22097.339
gene-HBA1 HBA1 NA human human_globin 104974-001-004_S1_L001 21210.681
gene-FTL FTL NA human human 104974-001-004_S1_L001 19832.883
gene-UBA52 UBA52 NA human human 104974-001-004_S1_L001 13148.031
gene-RPL21 RPL21 NA human human 104974-001-004_S1_L001 12112.246
gene-RPS12 RPS12 NA human human 104974-001-004_S1_L001 11859.816
gene-B2M B2M NA human human 104974-001-004_S1_L001 10750.176
gene-UBB UBB NA human human 104974-001-004_S1_L001 9926.561
gene-RNR2 RNR2 NA human human_rRNA 104974-001-004_S1_L001 9832.230
gene-GYPC GYPC NA human human 104974-001-004_S1_L001 8986.872
gene-PKNH_0418600 gene-PKNH_0418600 protein_coding pk pk 104974-001-004_S1_L001 8314.573
gene-S100A8 S100A8 NA human human 104974-001-004_S1_L001 7066.256
gene-S100A9 S100A9 NA human human 104974-001-004_S1_L001 7006.419
gene-FKBP8 FKBP8 NA human human 104974-001-004_S1_L001 6986.322
gene-COX3 COX3 NA human human 104974-001-004_S1_L001 6966.357
gene-OAZ1 OAZ1 NA human human 104974-001-004_S1_L001 6785.591
gene-RPL41 RPL41 NA human human 104974-001-004_S1_L001 6779.247
gene-COX1 COX1 NA human human 104974-001-004_S1_L001 6315.816
gene-RPS27 RPS27 NA human human 104974-001-004_S1_L001 6261.843
gene-SLC25A39 SLC25A39 NA human human 104974-001-004_S1_L001 6113.571
gene-ATP6 ATP6 NA human human 104974-001-004_S1_L001 6088.562
gene-ND3 ND3 NA human human 104974-001-004_S1_L001 5573.493
gene-COX2 COX2 NA human human 104974-001-004_S1_L001 5157.936
gene-RPLP2 RPLP2 NA human human 104974-001-004_S1_L001 5087.297

Sample 104974-001-005_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-HBA2 HBA2 NA human human_globin 104974-001-005_S1_L001 49591.793
gene-FTL FTL NA human human 104974-001-005_S1_L001 44351.515
gene-UBA52 UBA52 NA human human 104974-001-005_S1_L001 36767.580
gene-RPL21 RPL21 NA human human 104974-001-005_S1_L001 35030.120
gene-RPS12 RPS12 NA human human 104974-001-005_S1_L001 30090.973
gene-HBB HBB NA human human_globin 104974-001-005_S1_L001 29687.257
gene-UBB UBB NA human human 104974-001-005_S1_L001 26975.937
gene-RNR2 RNR2 NA human human_rRNA 104974-001-005_S1_L001 26033.024
gene-GYPC GYPC NA human human 104974-001-005_S1_L001 20473.960
gene-RPL41 RPL41 NA human human 104974-001-005_S1_L001 17038.159
gene-HBA1 HBA1 NA human human_globin 104974-001-005_S1_L001 16668.188
gene-OAZ1 OAZ1 NA human human 104974-001-005_S1_L001 14476.128
gene-FKBP8 FKBP8 NA human human 104974-001-005_S1_L001 13027.216
gene-SLC25A39 SLC25A39 NA human human 104974-001-005_S1_L001 12949.204
gene-RNR1 RNR1 NA human human_rRNA 104974-001-005_S1_L001 8443.443
gene-RPLP2 RPLP2 NA human human 104974-001-005_S1_L001 7333.275
gene-RPS11 RPS11 NA human human 104974-001-005_S1_L001 7058.348
gene-PKNH_0418600 gene-PKNH_0418600 protein_coding pk pk 104974-001-005_S1_L001 6826.241
gene-RPL30 RPL30 NA human human 104974-001-005_S1_L001 6393.033
gene-GUK1 GUK1 NA human human 104974-001-005_S1_L001 6026.648
gene-RPL12 RPL12 NA human human 104974-001-005_S1_L001 5719.974
gene-ADIPOR1 ADIPOR1 NA human human 104974-001-005_S1_L001 5600.955
gene-PKNH_1355500 gene-PKNH_1355500 protein_coding pk pk 104974-001-005_S1_L001 5244.759
gene-RPS15 RPS15 NA human human 104974-001-005_S1_L001 4708.231
gene-RPS24 RPS24 NA human human 104974-001-005_S1_L001 4399.440

Sample 104974-001-006_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-B2M B2M NA human human 104974-001-006_S1_L001 28553.397
gene-COX3 COX3 NA human human 104974-001-006_S1_L001 19236.293
gene-S100A9 S100A9 NA human human 104974-001-006_S1_L001 18303.677
gene-TMSB4X TMSB4X NA human human 104974-001-006_S1_L001 18124.259
gene-S100A8 S100A8 NA human human 104974-001-006_S1_L001 16150.164
gene-B2M-2 B2M NA human human 104974-001-006_S1_L001 15785.354
gene-ND3 ND3 NA human human 104974-001-006_S1_L001 15074.849
gene-COX1 COX1 NA human human 104974-001-006_S1_L001 14709.890
gene-COX2 COX2 NA human human 104974-001-006_S1_L001 14454.597
gene-ND2 ND2 NA human human 104974-001-006_S1_L001 14204.682
gene-FTL FTL NA human human 104974-001-006_S1_L001 11557.118
gene-ND1 ND1 NA human human 104974-001-006_S1_L001 10850.009
gene-ATP6 ATP6 NA human human 104974-001-006_S1_L001 10710.461
gene-RPS27 RPS27 NA human human 104974-001-006_S1_L001 9942.583
gene-ND4 ND4 NA human human 104974-001-006_S1_L001 9064.038
gene-ATP8 ATP8 NA human human 104974-001-006_S1_L001 8615.542
gene-EEF1A1 EEF1A1 NA human human 104974-001-006_S1_L001 8459.874
gene-RPS20 RPS20 NA human human 104974-001-006_S1_L001 7580.700
gene-CYTB CYTB NA human human 104974-001-006_S1_L001 7165.345
gene-RPL41 RPL41 NA human human 104974-001-006_S1_L001 7005.028
gene-RPL19 RPL19 NA human human 104974-001-006_S1_L001 6904.254
gene-RPL21 RPL21 NA human human 104974-001-006_S1_L001 6793.811
gene-RPL34 RPL34 NA human human 104974-001-006_S1_L001 5924.000
gene-RPL9 RPL9 NA human human 104974-001-006_S1_L001 5814.001
gene-FTH1 FTH1 NA human human 104974-001-006_S1_L001 5571.215

Sample 104974-001-007_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-RN7SL2 RN7SL2 NA human human 104974-001-007_S1_L001 131068.139
gene-RN7SL1 RN7SL1 NA human human 104974-001-007_S1_L001 92664.227
gene-PKNH_0321100 gene-PKNH_0321100 rRNA pk pk_rRNA 104974-001-007_S1_L001 73099.379
gene-RN7SK RN7SK NA human human 104974-001-007_S1_L001 70738.871
gene-PKNH_1001000 gene-PKNH_1001000 rRNA pk pk_rRNA 104974-001-007_S1_L001 58787.433
gene-PKNH_0320900 gene-PKNH_0320900 rRNA pk pk_rRNA 104974-001-007_S1_L001 27365.645
gene-PKNH_1001200 gene-PKNH_1001200 rRNA pk pk_rRNA 104974-001-007_S1_L001 25716.594
gene-PKNH_0321000 gene-PKNH_0321000 rRNA pk pk_rRNA 104974-001-007_S1_L001 17270.634
gene-RMRP RMRP NA human human 104974-001-007_S1_L001 14366.099
gene-B2M B2M NA human human 104974-001-007_S1_L001 12165.333
gene-PKNH_1339200 gene-PKNH_1339200 ncRNA pk pk 104974-001-007_S1_L001 8712.438
gene-TMSB4X TMSB4X NA human human 104974-001-007_S1_L001 7823.012
gene-COX3 COX3 NA human human 104974-001-007_S1_L001 7685.253
gene-PKNH_1001100 gene-PKNH_1001100 rRNA pk pk_rRNA 104974-001-007_S1_L001 7579.708
gene-ND2 ND2 NA human human 104974-001-007_S1_L001 6852.100
gene-COX1 COX1 NA human human 104974-001-007_S1_L001 6031.814
gene-COX2 COX2 NA human human 104974-001-007_S1_L001 5463.136
gene-ATP6 ATP6 NA human human 104974-001-007_S1_L001 5451.582
gene-S100A8 S100A8 NA human human 104974-001-007_S1_L001 5309.715
gene-ND1 ND1 NA human human 104974-001-007_S1_L001 5106.975
gene-ND4 ND4 NA human human 104974-001-007_S1_L001 4979.826
gene-PKNH_0935200 gene-PKNH_0935200 snRNA pk pk 104974-001-007_S1_L001 4468.936
gene-ND3 ND3 NA human human 104974-001-007_S1_L001 4455.028
gene-S100A9 S100A9 NA human human 104974-001-007_S1_L001 4377.385
gene-RPPH1 RPPH1 NA human human 104974-001-007_S1_L001 4178.458

Sample 104974-001-008_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-RN7SL2 RN7SL2 NA human human 104974-001-008_S1_L001 139279.456
gene-RN7SL1 RN7SL1 NA human human 104974-001-008_S1_L001 88237.728
gene-PKNH_1001000 gene-PKNH_1001000 rRNA pk pk_rRNA 104974-001-008_S1_L001 78313.591
gene-PKNH_0321100 gene-PKNH_0321100 rRNA pk pk_rRNA 104974-001-008_S1_L001 76693.896
gene-RN7SK RN7SK NA human human 104974-001-008_S1_L001 76096.199
gene-PKNH_0320900 gene-PKNH_0320900 rRNA pk pk_rRNA 104974-001-008_S1_L001 30296.682
gene-PKNH_1001200 gene-PKNH_1001200 rRNA pk pk_rRNA 104974-001-008_S1_L001 27906.598
gene-PKNH_0321000 gene-PKNH_0321000 rRNA pk pk_rRNA 104974-001-008_S1_L001 23171.875
gene-RMRP RMRP NA human human 104974-001-008_S1_L001 16032.224
gene-PKNH_1001100 gene-PKNH_1001100 rRNA pk pk_rRNA 104974-001-008_S1_L001 14289.531
gene-B2M B2M NA human human 104974-001-008_S1_L001 12222.046
gene-PKNH_1339200 gene-PKNH_1339200 ncRNA pk pk 104974-001-008_S1_L001 9740.299
gene-PKNH_0935200 gene-PKNH_0935200 snRNA pk pk 104974-001-008_S1_L001 7697.558
gene-COX3 COX3 NA human human 104974-001-008_S1_L001 7411.967
gene-TMSB4X TMSB4X NA human human 104974-001-008_S1_L001 6867.503
gene-ND2 ND2 NA human human 104974-001-008_S1_L001 5897.639
gene-COX1 COX1 NA human human 104974-001-008_S1_L001 5487.208
gene-S100A8 S100A8 NA human human 104974-001-008_S1_L001 5483.937
gene-COX2 COX2 NA human human 104974-001-008_S1_L001 5235.476
gene-RPPH1 RPPH1 NA human human 104974-001-008_S1_L001 5041.748
gene-RPPH1-2 RPPH1 NA human human 104974-001-008_S1_L001 4996.489
gene-ATP6 ATP6 NA human human 104974-001-008_S1_L001 4855.997
gene-S100A9 S100A9 NA human human 104974-001-008_S1_L001 4804.481
gene-ND1 ND1 NA human human 104974-001-008_S1_L001 4769.538
gene-ND4 ND4 NA human human 104974-001-008_S1_L001 4491.039

Sample 104974-001-009_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-RN7SL2 RN7SL2 NA human human 104974-001-009_S1_L001 145681.116
gene-PKNH_0321100 gene-PKNH_0321100 rRNA pk pk_rRNA 104974-001-009_S1_L001 144323.085
gene-PKNH_1001000 gene-PKNH_1001000 rRNA pk pk_rRNA 104974-001-009_S1_L001 113693.416
gene-RN7SL1 RN7SL1 NA human human 104974-001-009_S1_L001 97956.989
gene-PKNH_0320900 gene-PKNH_0320900 rRNA pk pk_rRNA 104974-001-009_S1_L001 58482.134
gene-PKNH_0321000 gene-PKNH_0321000 rRNA pk pk_rRNA 104974-001-009_S1_L001 54071.123
gene-PKNH_1001200 gene-PKNH_1001200 rRNA pk pk_rRNA 104974-001-009_S1_L001 53881.108
gene-PKNH_1001100 gene-PKNH_1001100 rRNA pk pk_rRNA 104974-001-009_S1_L001 24293.746
gene-RN7SK RN7SK NA human human 104974-001-009_S1_L001 20907.443
gene-PKNH_0935200 gene-PKNH_0935200 snRNA pk pk 104974-001-009_S1_L001 18472.293
gene-PKNH_1339200 gene-PKNH_1339200 ncRNA pk pk 104974-001-009_S1_L001 12768.772
gene-HBB HBB NA human human_globin 104974-001-009_S1_L001 6401.856
gene-FTL FTL NA human human 104974-001-009_S1_L001 5156.650
gene-UBB UBB NA human human 104974-001-009_S1_L001 3804.390
gene-ADIPOR1 ADIPOR1 NA human human 104974-001-009_S1_L001 3777.510
gene-RPPH1-2 RPPH1 NA human human 104974-001-009_S1_L001 3718.740
gene-RPPH1 RPPH1 NA human human 104974-001-009_S1_L001 3692.364
gene-RNY1 RNY1 NA human human 104974-001-009_S1_L001 3687.490
gene-RNA28SN4 RNA28SN4 NA human human_rRNA 104974-001-009_S1_L001 3199.299
gene-GYPC GYPC NA human human 104974-001-009_S1_L001 3180.375
gene-RPL21 RPL21 NA human human 104974-001-009_S1_L001 2826.477
gene-HBA2 HBA2 NA human human_globin 104974-001-009_S1_L001 2678.665
gene-RMRP RMRP NA human human 104974-001-009_S1_L001 2494.538
gene-UBA52 UBA52 NA human human 104974-001-009_S1_L001 2312.951
gene-PKNH_0418600 gene-PKNH_0418600 protein_coding pk pk 104974-001-009_S1_L001 2094.309

Sample 104974-001-010_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-PKNH_0321100 gene-PKNH_0321100 rRNA pk pk_rRNA 104974-001-010_S1_L001 181501.107
gene-PKNH_1001000 gene-PKNH_1001000 rRNA pk pk_rRNA 104974-001-010_S1_L001 143352.902
gene-RN7SL2 RN7SL2 NA human human 104974-001-010_S1_L001 86488.492
gene-PKNH_0320900 gene-PKNH_0320900 rRNA pk pk_rRNA 104974-001-010_S1_L001 70146.903
gene-PKNH_1001200 gene-PKNH_1001200 rRNA pk pk_rRNA 104974-001-010_S1_L001 66180.869
gene-RN7SL1 RN7SL1 NA human human 104974-001-010_S1_L001 60353.922
gene-PKNH_0321000 gene-PKNH_0321000 rRNA pk pk_rRNA 104974-001-010_S1_L001 54735.717
gene-PKNH_1001100 gene-PKNH_1001100 rRNA pk pk_rRNA 104974-001-010_S1_L001 20313.046
gene-PKNH_0935200 gene-PKNH_0935200 snRNA pk pk 104974-001-010_S1_L001 16268.334
gene-RN7SK RN7SK NA human human 104974-001-010_S1_L001 13867.096
gene-PKNH_1339200 gene-PKNH_1339200 ncRNA pk pk 104974-001-010_S1_L001 9108.890
gene-RNA28SN4 RNA28SN4 NA human human_rRNA 104974-001-010_S1_L001 3993.278
gene-HBA2 HBA2 NA human human_globin 104974-001-010_S1_L001 3889.037
gene-HBB HBB NA human human_globin 104974-001-010_S1_L001 3806.526
gene-ADIPOR1 ADIPOR1 NA human human 104974-001-010_S1_L001 3607.743
gene-FTL FTL NA human human 104974-001-010_S1_L001 3459.588
gene-HBA1 HBA1 NA human human_globin 104974-001-010_S1_L001 3183.081
gene-GYPC GYPC NA human human 104974-001-010_S1_L001 2799.467
gene-RPPH1 RPPH1 NA human human 104974-001-010_S1_L001 2659.526
gene-PKNH_0418600 gene-PKNH_0418600 protein_coding pk pk 104974-001-010_S1_L001 2645.063
gene-RPPH1-2 RPPH1 NA human human 104974-001-010_S1_L001 2627.532
gene-PKNH_1132600 gene-PKNH_1132600 protein_coding pk pk 104974-001-010_S1_L001 2592.234
gene-COX1 COX1 NA human human 104974-001-010_S1_L001 2414.909
gene-UBB UBB NA human human 104974-001-010_S1_L001 2265.004
gene-RMRP RMRP NA human human 104974-001-010_S1_L001 2211.677

Sample 104974-001-011_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-RN7SL2 RN7SL2 NA human human 104974-001-011_S1_L001 161042.174
gene-PKNH_0321100 gene-PKNH_0321100 rRNA pk pk_rRNA 104974-001-011_S1_L001 147935.000
gene-PKNH_1001000 gene-PKNH_1001000 rRNA pk pk_rRNA 104974-001-011_S1_L001 115582.956
gene-RN7SL1 RN7SL1 NA human human 104974-001-011_S1_L001 108220.532
gene-PKNH_0320900 gene-PKNH_0320900 rRNA pk pk_rRNA 104974-001-011_S1_L001 61256.956
gene-PKNH_1001200 gene-PKNH_1001200 rRNA pk pk_rRNA 104974-001-011_S1_L001 55243.977
gene-PKNH_0321000 gene-PKNH_0321000 rRNA pk pk_rRNA 104974-001-011_S1_L001 42015.846
gene-PKNH_1001100 gene-PKNH_1001100 rRNA pk pk_rRNA 104974-001-011_S1_L001 18712.764
gene-RN7SK RN7SK NA human human 104974-001-011_S1_L001 10855.497
gene-PKNH_0935200 gene-PKNH_0935200 snRNA pk pk 104974-001-011_S1_L001 10217.904
gene-PKNH_1339200 gene-PKNH_1339200 ncRNA pk pk 104974-001-011_S1_L001 7907.086
gene-FTL FTL NA human human 104974-001-011_S1_L001 6860.040
gene-HBB HBB NA human human_globin 104974-001-011_S1_L001 5702.090
gene-ADIPOR1 ADIPOR1 NA human human 104974-001-011_S1_L001 5452.125
gene-UBB UBB NA human human 104974-001-011_S1_L001 5000.284
gene-GYPC GYPC NA human human 104974-001-011_S1_L001 4800.788
gene-RNA28SN4 RNA28SN4 NA human human_rRNA 104974-001-011_S1_L001 4511.897
gene-RPL21 RPL21 NA human human 104974-001-011_S1_L001 4355.019
gene-RPPH1-2 RPPH1 NA human human 104974-001-011_S1_L001 3979.430
gene-RPPH1 RPPH1 NA human human 104974-001-011_S1_L001 3978.027
gene-SLC25A39 SLC25A39 NA human human 104974-001-011_S1_L001 3085.912
gene-RNY1 RNY1 NA human human 104974-001-011_S1_L001 2933.827
gene-UBA52 UBA52 NA human human 104974-001-011_S1_L001 2915.853
gene-OAZ1 OAZ1 NA human human 104974-001-011_S1_L001 2894.726
gene-HBA2 HBA2 NA human human_globin 104974-001-011_S1_L001 2852.671

Sample 104974-001-012_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-HBB HBB NA human human_globin 104974-001-012_S1_L001 366480.180
gene-HBA2 HBA2 NA human human_globin 104974-001-012_S1_L001 263245.083
gene-HBA1 HBA1 NA human human_globin 104974-001-012_S1_L001 98009.922
gene-B2M B2M NA human human 104974-001-012_S1_L001 6733.582
gene-RNR2 RNR2 NA human human_rRNA 104974-001-012_S1_L001 5784.927
gene-S100A9 S100A9 NA human human 104974-001-012_S1_L001 5749.014
gene-S100A8 S100A8 NA human human 104974-001-012_S1_L001 4593.144
gene-RNA28SN4 RNA28SN4 NA human human_rRNA 104974-001-012_S1_L001 4190.916
gene-LOC124905315 LOC124905315 NA human human 104974-001-012_S1_L001 3825.278
gene-RNR1 RNR1 NA human human_rRNA 104974-001-012_S1_L001 3549.196
gene-TMSB4X TMSB4X NA human human 104974-001-012_S1_L001 3347.079
gene-RPS27 RPS27 NA human human 104974-001-012_S1_L001 3045.938
gene-FTL FTL NA human human 104974-001-012_S1_L001 3016.675
gene-ND3 ND3 NA human human 104974-001-012_S1_L001 2750.724
gene-COX3 COX3 NA human human 104974-001-012_S1_L001 2577.031
gene-RPL41 RPL41 NA human human 104974-001-012_S1_L001 2375.402
gene-RPS12 RPS12 NA human human 104974-001-012_S1_L001 2212.215
gene-COX2 COX2 NA human human 104974-001-012_S1_L001 2207.033
gene-RPL39 RPL39 NA human human 104974-001-012_S1_L001 2132.982
gene-ATP8 ATP8 NA human human 104974-001-012_S1_L001 1919.579
gene-EEF1A1 EEF1A1 NA human human 104974-001-012_S1_L001 1903.976
gene-RNA28SN2 RNA28SN2 NA human human_rRNA 104974-001-012_S1_L001 1875.808
gene-ATP6 ATP6 NA human human 104974-001-012_S1_L001 1867.009
gene-RNA18SN2 RNA18SN2 NA human human_rRNA 104974-001-012_S1_L001 1677.114
gene-RNA18SN3 RNA18SN3 NA human human_rRNA 104974-001-012_S1_L001 1671.307

Sample 104974-001-013_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-HBB HBB NA human human_globin 104974-001-013_S1_L001 388222.402
gene-HBA2 HBA2 NA human human_globin 104974-001-013_S1_L001 289856.854
gene-HBA1 HBA1 NA human human_globin 104974-001-013_S1_L001 103444.535
gene-S100A9 S100A9 NA human human 104974-001-013_S1_L001 6019.643
gene-B2M B2M NA human human 104974-001-013_S1_L001 5886.942
gene-RNR2 RNR2 NA human human_rRNA 104974-001-013_S1_L001 5240.822
gene-S100A8 S100A8 NA human human 104974-001-013_S1_L001 4629.461
gene-RNR1 RNR1 NA human human_rRNA 104974-001-013_S1_L001 3265.530
gene-FTL FTL NA human human 104974-001-013_S1_L001 3136.281
gene-TMSB4X TMSB4X NA human human 104974-001-013_S1_L001 2718.495
gene-RNA28SN4 RNA28SN4 NA human human_rRNA 104974-001-013_S1_L001 2457.827
gene-COX3 COX3 NA human human 104974-001-013_S1_L001 2138.235
gene-ND3 ND3 NA human human 104974-001-013_S1_L001 2125.318
gene-RPS27 RPS27 NA human human 104974-001-013_S1_L001 2021.781
gene-RPS12 RPS12 NA human human 104974-001-013_S1_L001 1910.314
gene-RPL41 RPL41 NA human human 104974-001-013_S1_L001 1851.973
gene-LOC124905315 LOC124905315 NA human human 104974-001-013_S1_L001 1741.411
gene-COX2 COX2 NA human human 104974-001-013_S1_L001 1678.269
gene-FTH1 FTH1 NA human human 104974-001-013_S1_L001 1635.636
gene-RPL39 RPL39 NA human human 104974-001-013_S1_L001 1558.862
gene-ATP8 ATP8 NA human human 104974-001-013_S1_L001 1532.902
gene-EEF1A1 EEF1A1 NA human human 104974-001-013_S1_L001 1446.312
gene-ATP6 ATP6 NA human human 104974-001-013_S1_L001 1419.527
gene-RPL21 RPL21 NA human human 104974-001-013_S1_L001 1331.099
gene-COX1 COX1 NA human human 104974-001-013_S1_L001 1240.701

Sample 104974-001-014_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-HBB HBB NA human human_globin 104974-001-014_S1_L001 523264.4084
gene-HBA2 HBA2 NA human human_globin 104974-001-014_S1_L001 284480.8538
gene-HBA1 HBA1 NA human human_globin 104974-001-014_S1_L001 114371.1571
gene-LOC124905315 LOC124905315 NA human human 104974-001-014_S1_L001 2918.1146
gene-RNA28SN4 RNA28SN4 NA human human_rRNA 104974-001-014_S1_L001 2536.0602
gene-FTL FTL NA human human 104974-001-014_S1_L001 1871.6873
gene-RNA18SN3-2 RNA18SN3 NA human human_rRNA 104974-001-014_S1_L001 1543.4707
gene-RNA18SN4 RNA18SN4 NA human human_rRNA 104974-001-014_S1_L001 1542.5888
gene-RNA18SN3 RNA18SN3 NA human human_rRNA 104974-001-014_S1_L001 1542.4997
gene-RNA18SN2 RNA18SN2 NA human human_rRNA 104974-001-014_S1_L001 1535.9067
gene-RPS12 RPS12 NA human human 104974-001-014_S1_L001 1366.6827
gene-RNA28SN2 RNA28SN2 NA human human_rRNA 104974-001-014_S1_L001 1296.8516
gene-RNR2 RNR2 NA human human_rRNA 104974-001-014_S1_L001 1166.9362
gene-UBA52 UBA52 NA human human 104974-001-014_S1_L001 1068.1666
gene-RPL21 RPL21 NA human human 104974-001-014_S1_L001 1028.5515
gene-UBB UBB NA human human 104974-001-014_S1_L001 892.5544
gene-GYPC GYPC NA human human 104974-001-014_S1_L001 851.2235
gene-HBG2 HBG2 NA human human_globin 104974-001-014_S1_L001 842.1872
gene-FKBP8 FKBP8 NA human human 104974-001-014_S1_L001 814.0800
gene-B2M B2M NA human human 104974-001-014_S1_L001 783.0767
gene-RPL41 RPL41 NA human human 104974-001-014_S1_L001 715.1521
gene-RNA5-8SN4 RNA5-8SN4 NA human human_rRNA 104974-001-014_S1_L001 636.0227
gene-OAZ1 OAZ1 NA human human 104974-001-014_S1_L001 599.3413
gene-S100A9 S100A9 NA human human 104974-001-014_S1_L001 578.5453
gene-SLC25A39 SLC25A39 NA human human 104974-001-014_S1_L001 555.1666

Sample 104974-001-015_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-HBB HBB NA human human_globin 104974-001-015_S1_L001 448423.0722
gene-HBA2 HBA2 NA human human_globin 104974-001-015_S1_L001 316409.7173
gene-HBA1 HBA1 NA human human_globin 104974-001-015_S1_L001 127791.9575
gene-LOC124905315 LOC124905315 NA human human 104974-001-015_S1_L001 3596.4548
gene-RNA28SN4 RNA28SN4 NA human human_rRNA 104974-001-015_S1_L001 3411.2697
gene-RNA18SN2 RNA18SN2 NA human human_rRNA 104974-001-015_S1_L001 1762.2669
gene-RNA18SN4 RNA18SN4 NA human human_rRNA 104974-001-015_S1_L001 1756.8958
gene-RNA18SN3 RNA18SN3 NA human human_rRNA 104974-001-015_S1_L001 1753.9691
gene-RNA18SN3-2 RNA18SN3 NA human human_rRNA 104974-001-015_S1_L001 1750.1382
gene-FTL FTL NA human human 104974-001-015_S1_L001 1660.3870
gene-RNA28SN2 RNA28SN2 NA human human_rRNA 104974-001-015_S1_L001 1615.4719
gene-RPS12 RPS12 NA human human 104974-001-015_S1_L001 1099.8989
gene-UBA52 UBA52 NA human human 104974-001-015_S1_L001 1020.6326
gene-PKNH_0418600 gene-PKNH_0418600 protein_coding pk pk 104974-001-015_S1_L001 925.7916
gene-FKBP8 FKBP8 NA human human 104974-001-015_S1_L001 917.5296
gene-GYPC GYPC NA human human 104974-001-015_S1_L001 858.6056
gene-B2M B2M NA human human 104974-001-015_S1_L001 792.5917
gene-RPL21 RPL21 NA human human 104974-001-015_S1_L001 779.4010
gene-RNR2 RNR2 NA human human_rRNA 104974-001-015_S1_L001 768.8011
gene-UBB UBB NA human human 104974-001-015_S1_L001 734.8697
gene-S100A9 S100A9 NA human human 104974-001-015_S1_L001 692.6973
gene-HBG2 HBG2 NA human human_globin 104974-001-015_S1_L001 663.0136
gene-OAZ1 OAZ1 NA human human 104974-001-015_S1_L001 576.5540
gene-SLC25A39 SLC25A39 NA human human 104974-001-015_S1_L001 567.3799
gene-RPL41 RPL41 NA human human 104974-001-015_S1_L001 566.2701

Sample 104974-001-016_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-HBB HBB NA human human_globin 104974-001-016_S1_L001 514171.8546
gene-HBA2 HBA2 NA human human_globin 104974-001-016_S1_L001 301490.6876
gene-HBA1 HBA1 NA human human_globin 104974-001-016_S1_L001 122126.6477
gene-LOC124905315 LOC124905315 NA human human 104974-001-016_S1_L001 3512.6965
gene-RNA28SN4 RNA28SN4 NA human human_rRNA 104974-001-016_S1_L001 3033.8382
gene-RNA18SN2 RNA18SN2 NA human human_rRNA 104974-001-016_S1_L001 2096.6635
gene-RNA18SN3 RNA18SN3 NA human human_rRNA 104974-001-016_S1_L001 2086.6004
gene-RNA18SN4 RNA18SN4 NA human human_rRNA 104974-001-016_S1_L001 2073.9575
gene-RNA18SN3-2 RNA18SN3 NA human human_rRNA 104974-001-016_S1_L001 2066.1062
gene-FTL FTL NA human human 104974-001-016_S1_L001 1780.1712
gene-RNA28SN2 RNA28SN2 NA human human_rRNA 104974-001-016_S1_L001 1534.3552
gene-RPS12 RPS12 NA human human 104974-001-016_S1_L001 1288.8221
gene-UBA52 UBA52 NA human human 104974-001-016_S1_L001 1087.6524
gene-RPL21 RPL21 NA human human 104974-001-016_S1_L001 990.7414
gene-RNR2 RNR2 NA human human_rRNA 104974-001-016_S1_L001 990.5111
gene-UBB UBB NA human human 104974-001-016_S1_L001 922.4129
gene-HBG2 HBG2 NA human human_globin 104974-001-016_S1_L001 911.4972
gene-GYPC GYPC NA human human 104974-001-016_S1_L001 883.1664
gene-FKBP8 FKBP8 NA human human 104974-001-016_S1_L001 818.6246
gene-OAZ1 OAZ1 NA human human 104974-001-016_S1_L001 600.4898
gene-RPL41 RPL41 NA human human 104974-001-016_S1_L001 559.7779
gene-SLC25A39 SLC25A39 NA human human 104974-001-016_S1_L001 555.5004
gene-RNA5-8SN4 RNA5-8SN4 NA human human_rRNA 104974-001-016_S1_L001 525.1435
gene-RN7SL2 RN7SL2 NA human human 104974-001-016_S1_L001 410.6589
gene-PKNH_0418600 gene-PKNH_0418600 protein_coding pk pk 104974-001-016_S1_L001 380.8039

FastQ Screen statistics

Also see figure in results-refseq/fastq-screen/ and results-refseq/multiqc-fastq-screen-samtools-pk/multiqc_report.html.

All reads for both species

fastq_screen_transformed <- bind_rows(
  fastq_screen %>%
    mutate(`%_single_genome` = `%_One_hit_one_genome` + `%_Multiple_hits_one_genome`) %>%
    mutate(`%_multiple_genomes` = `%_One_hit_multiple_genomes` + `%_Multiple_hits_multiple_genomes`) %>%
    mutate(`No_hits` = `%_One_hit_multiple_genomes` + `%_Multiple_hits_multiple_genomes`) %>%
    select(c(sample, Genome, `#Reads_processed`, `%_single_genome`, `%_multiple_genomes`)) %>%
    pivot_wider(names_from = Genome, values_from = c(`%_single_genome`, `%_multiple_genomes`)) %>%
    select(!`%_multiple_genomes_PknowlesiH`) %>%
    rename(`%_multiple_genomes` = `%_multiple_genomes_Human`) %>%
    mutate(pct = 1 - `%_single_genome_PknowlesiH` - `%_single_genome_Human` - `%_multiple_genomes`) %>%
    select(sample, pct) %>%
    mutate("Mapped reads" = "No hits"),
  fastq_screen %>%
    mutate(pct = `%_One_hit_one_genome` + `%_Multiple_hits_one_genome`) %>%
    select(c(sample, Genome, pct)) %>%
    rename("Mapped reads" = Genome),
  fastq_screen %>%
    mutate(pct = `%_One_hit_multiple_genomes` + `%_Multiple_hits_multiple_genomes`) %>%
    select(c(sample, pct)) %>%
    distinct(sample, .keep_all = TRUE) %>%
    mutate("Mapped reads" = "Multiple genomes")
) %>%
  # change order of levels here to adjust colours
  mutate(`Mapped reads` = factor(`Mapped reads`,
    levels = c("PknowlesiH", "Human", "Multiple genomes", "No hits"),
    labels = c("P. knowlesi", "Human", "Multiple hits", "No hits")
  )) %>%
  left_join(samples, by = "sample")

ggplot(fastq_screen_transformed, aes(x = sample_short, y = pct, fill = `Mapped reads`)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(option = "viridis", discrete = T, limits = rev) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  labs(fill = expression("FastQ Screen mapped reads"), x = "Sample") +
  ylab(label = "% of mapped reads")

Human reads only

# Human reads
fastq_screen %>%
  pivot_longer(
    cols = starts_with("%"),
    names_to = "fastq_screen_pct_type",
    values_to = "fastq_screen_pct_value"
  ) %>%
  filter(Genome == "Human") %>%
  left_join(samples) %>%
  ggplot(aes(x = sample_short, y = fastq_screen_pct_value, fill = fastq_screen_pct_type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(option = "viridis", discrete = T) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  labs(fill = "FastQ screen statistics for human reads", x = "Sample", y = "% of mapped reads")
## Joining with `by = join_by(sample)`

Pk reads

# Pk reads
fastq_screen %>%
  pivot_longer(
    cols = starts_with("%"),
    names_to = "fastq_screen_pct_type",
    values_to = "fastq_screen_pct_value"
  ) %>%
  filter(Genome == "PknowlesiH") %>%
  left_join(samples) %>%
  ggplot(aes(x = sample_short, y = fastq_screen_pct_value, fill = fastq_screen_pct_type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(option = "viridis", discrete = T) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  labs(fill = expression("FastQ screen statistics for" ~ italic("P. knowlesi") ~ "reads"), x = "Sample", y = "% of mapped reads")
## Joining with `by = join_by(sample)`

rRNA and globin levels

Barplot of TPM for rRNA genes per condition

gene_annotation %>%
  # select rRNA and globins
  filter(!gene_type %in% c("human", "pk")) %>%
  left_join(gene_tpm_filtered[1:18], by = c("gene_id", "gene_name")) %>%
  filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0) %>%
  pivot_longer(
    cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`,
    names_to = "sample",
    values_to = "counts"
  ) %>%
  left_join(samples, by = "sample") %>%
  group_by(gene_id) %>%
  mutate(total = sum(counts)) %>%
  ungroup() %>%
  arrange(species, gene_type, desc(total)) %>%
  mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
  ggplot(aes(x = gene_name, y = counts, fill = sample_short)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(option = "viridis", discrete = T) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  labs(fill = "Sample") +
  ylab(label = "TPM") +
  xlab(label = "Gene name") +
  facet_wrap(~kit_name) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale()))

Dot plot of log(TPM+1) for rRNA genes per condition

Log TPM is often plotted instead of raw values, but can be misleading on barplot…

gene_annotation %>%
  filter(!gene_type %in% c("human", "pk")) %>%
  left_join(gene_tpm_filtered[1:18], by = c("gene_id", "gene_name")) %>%
  filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0) %>%
  pivot_longer(
    cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`,
    names_to = "sample",
    values_to = "counts"
  ) %>%
  left_join(samples, by = "sample") %>%
  group_by(gene_id) %>%
  mutate(total = sum(counts)) %>%
  ungroup() %>%
  arrange(species, gene_type, desc(total)) %>%
  mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
  ggplot(aes(x = gene_name, y = counts)) +
  geom_point(aes(colour = sample_name)) +
  scale_colour_viridis(option = "viridis", discrete = T) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  # labs(colour = "Sample") +
  labs(x = "Gene name", y = "log(TPM+1)", colour = "Sample") +
  facet_wrap(~sample_name) +
  scale_y_continuous(trans = "log1p")

Globin only

gene_annotation %>%
  filter(gene_type == "human_globin") %>%
  left_join(gene_tpm_filtered[1:18], by = c("gene_id", "gene_name")) %>%
  filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0) %>%
  pivot_longer(
    cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`,
    names_to = "sample",
    values_to = "counts"
  ) %>%
  left_join(samples, by = "sample") %>%
  group_by(gene_id) %>%
  mutate(total = sum(counts)) %>%
  ungroup() %>%
  arrange(species, gene_type, desc(total)) %>%
  mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
  ggplot(aes(x = gene_name, y = counts)) +
  geom_point(aes(colour = sample)) +
  scale_colour_viridis(option = "viridis", discrete = T) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  labs(x = "Gene name", y = "TPM", colour = "Sample") +
  facet_wrap(~sample_name) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale()))

Human rRNA only

gene_annotation %>%
  filter(gene_type == "human_rRNA") %>%
  left_join(gene_tpm_filtered[1:18], by = c("gene_id", "gene_name")) %>%
  filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0) %>%
  pivot_longer(
    cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`,
    names_to = "sample",
    values_to = "counts"
  ) %>%
  left_join(samples, by = "sample") %>%
  group_by(gene_id) %>%
  mutate(total = sum(counts)) %>%
  ungroup() %>%
  arrange(species, gene_type, desc(total)) %>%
  mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
  ggplot(aes(x = gene_name, y = counts)) +
  geom_point(aes(colour = sample)) +
  scale_colour_viridis(option = "viridis", discrete = T) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  labs(x = "Gene name", y = "TPM", colour = "Sample") +
  facet_wrap(~sample_name) +
  scale_y_continuous(trans = "log1p")

Plot percentage of reads for each gene type (rRNA/globin) and species

TPM values appear to be the most appropriate metric to use.

Gene-level counts

These gene counts were aggregated across all samples (tximport of transcript counts) and filtered to only genes with a minimum count of 10 reads for at least 1 sample.

gene_counts_txi_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev,
    labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Sum of gene counts", fill = "RNA type")

Gene-level TPM

Filtered to only genes with a minimum count of 10 reads for at least 1 sample.

gene_tpm_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev,
    labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Sum of TPM counts", fill = "RNA type")

Plot percentage of reads for each species

Gene-level counts

These gene counts were aggregated across all samples (tximport of transcript counts) and filtered to only genes with a minimum count of 10 reads for at least 1 sample.

# absolute numbers
gene_counts_txi_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, species) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = species)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev
  ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Sum of gene counts", fill = "Species")

# proportion
gene_counts_txi_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, species) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = species)) +
  geom_bar(position = "fill", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev
  ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Proportion of gene counts", fill = "Species")

Gene-level TPM

Note that since TPM is already scaled by library size, the absolute and proportion plots are identical.

# absolute numbers
gene_tpm_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, species) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = species)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev
  ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Sum of TPM counts", fill = "Species")

# proportion
gene_tpm_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, species) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = species)) +
  geom_bar(position = "fill", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev
  ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Proportion of TPM counts", fill = "Species")

Gene recovery

# create binary matrices with presence/absence of genes
gene_tpm_filtered_binary <- gene_tpm_filtered %>%
  mutate(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`, .fns = ~ .x > 0)) %>%
  select(!c(gene_name, gene_type, species))

gene_tpm_pk_filtered_binary <- gene_tpm_filtered %>%
  mutate(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`, .fns = ~ .x > 0)) %>%
  filter(species == "pk") %>%
  select(!c(gene_name, gene_type, species))

gene_tpm_filtered_binary_list <- purrr::set_names(colnames(gene_tpm_filtered_binary %>% select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`))) %>%
  purrr::imap(~ gene_tpm_filtered_binary$gene_id[gene_tpm_filtered_binary[, .x, drop = T]])

gene_tpm_pk_filtered_binary_list <- purrr::set_names(colnames(gene_tpm_pk_filtered_binary %>% select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`))) %>%
  purrr::imap(~ gene_tpm_pk_filtered_binary$gene_id[gene_tpm_pk_filtered_binary[, .x, drop = T]])

comparison_centpip <- samples %>%
  filter(WBC_depletion == "centpip") %>%
  select(sample)

comparison_plasmo <- samples %>%
  filter(WBC_depletion == "plasmo") %>%
  select(sample)

comparison_pmacs <- samples %>%
  filter(WBC_depletion == "pmacs") %>%
  select(sample)

comparison_cellulose <- samples %>%
  filter(WBC_depletion == "cellulose") %>%
  select(sample)

comparison_nofilter <- samples %>%
  filter(WBC_depletion == "nofilter") %>%
  filter(!sample == "104974-001-006_S1_L001")

comparison_mrna_qiagen <- samples %>%
  filter(kit == "mRNA_qiagen_FSglob")

comparison_trna_qiagen <- samples %>%
  filter(kit == "tRNA_qiagen_FSglobH")

comparison_mrna_illumina <- samples %>%
  filter(kit == "mRNA_illumina")

draw_venn <- function(set_list, names, title, output) {
  VennDiagram::venn.diagram(
    x = set_list,
    category.names = names,
    # Circles
    lwd = 2,
    # lty = "blank",
    # col = palette.colors(3),
    # fill = palette.colors(3),
    # col = hcl.colors(3, palette = "viridis", alpha = 0.9),
    # fill = hcl.colors(3, palette = "viridis", alpha = 0.4),
    col = hcl.colors(3, palette = "Set 2", alpha = 0.9),
    fill = hcl.colors(3, palette = "Set 2", alpha = 0.4),

    # Numbers
    cex = .9,
    fontface = "bold",
    fontfamily = "sans",

    # Set names
    cat.cex = 0.9,
    cat.fontface = "bold",
    cat.default.pos = "outer",
    cat.pos = c(-20, 20, 135),
    cat.dist = c(0.055, 0.055, 0.11),
    cat.fontfamily = "sans",
    rotation = 1,

    # title
    main = title,
    main.cex = 1.1,
    main.fontface = "bold",
    main.fontfamily = "sans",

    # output features
    height = 1200,
    width = 1200,
    resolution = 300,
    compression = "lzw",
    filename = output,
    output = TRUE,
    disable.logging = TRUE
  )
}

All genes - kits per filter

Gene counts were filtered on a minimum (raw) count of 10 for at least 1 sample before checking for presence/absence.

v <- draw_venn(
  gene_tpm_filtered_binary_list[comparison_centpip$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "Centpip",
  NULL
  # here("results-refseq/figures/venn-centpip.png")
)
grid.newpage()
grid.draw(v)

v <- draw_venn(
  gene_tpm_filtered_binary_list[comparison_plasmo$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "Plasmo",
  NULL
  # here("results-refseq/figures/venn-plasmo.png")
)
grid.newpage()
grid.draw(v)

v <- draw_venn(
  gene_tpm_filtered_binary_list[comparison_pmacs$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "PMACS",
  NULL
  # here("results-refseq/figures/venn-pmacs.png")
)
grid.newpage()
grid.draw(v)

v <- draw_venn(
  gene_tpm_filtered_binary_list[comparison_cellulose$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "Cellulose",
  NULL
  # here("results-refseq/figures/venn-cellulose.png")
)
grid.newpage()
grid.draw(v)

v <- draw_venn(
  gene_tpm_filtered_binary_list[comparison_nofilter$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "Cellulose",
  NULL
  # here("results-refseq/figures/venn-nofilter.png")
)
grid.newpage()
grid.draw(v)

P. knowlesi genes - kits per filter

Gene counts were filtered on a minimum (raw) count of 10 for at least 1 sample before checking for presence/absence.

v <- draw_venn(
  gene_tpm_pk_filtered_binary_list[comparison_centpip$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "Centpip",
  NULL
  # here("results-refseq/figures/venn-centpip_pk.png")
)
grid.newpage()
grid.draw(v)

v <- draw_venn(
  gene_tpm_pk_filtered_binary_list[comparison_plasmo$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "Plasmo",
  NULL
  # here("results-refseq/figures/venn-plasmo_pk.png")
)
grid.newpage()
grid.draw(v)

v <- draw_venn(
  gene_tpm_pk_filtered_binary_list[comparison_pmacs$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "PMACS",
  NULL
  # here("results-refseq/figures/venn-pmacs_pk.png")
)
grid.newpage()
grid.draw(v)

v <- draw_venn(
  gene_tpm_pk_filtered_binary_list[comparison_cellulose$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "Cellulose",
  NULL
  # here("results-refseq/figures/venn-cellulose_pk.png")
)
grid.newpage()
grid.draw(v)

v <- draw_venn(
  gene_tpm_pk_filtered_binary_list[comparison_nofilter$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "No filter",
  NULL
  # here("results-refseq/figures/venn-nofilter_pk.png")
)
grid.newpage()
grid.draw(v)

All genes - filters per kit

# Use upset plots instead of five-way venn diagram (alternatively check out proportional area venn)

# mRNA Qiagen
UpSetR::upset(
  as.data.frame(gene_tpm_filtered_binary %>%
    select((comparison_mrna_qiagen %>% select(sample) %>% pull())) %>%
    rename(all_of(c(
      "no filter" = "104974-001-001_S1_L001",
      "cent/pip" = "104974-001-002_S1_L001",
      "plasmodipur" = "104974-001-003_S1_L001",
      "PMACS" = "104974-001-004_S1_L001",
      "cellulose" = "104974-001-005_S1_L001"
    ))) %>%
    mutate_all(as.integer)),
  keep.order = TRUE,
  sets.x.label = "Genes per filter",
  mainbar.y.label = "Intersections for mRNA Qiagen",
  order.by = "freq"
)

# tRNA Qiagen
UpSetR::upset(
  as.data.frame(gene_tpm_filtered_binary %>%
    select((comparison_trna_qiagen %>% select(sample) %>% pull())) %>%
    rename(all_of(c(
      "no filter" = "104974-001-007_S1_L001",
      "cent/pip" = "104974-001-008_S1_L001",
      "plasmodipur" = "104974-001-009_S1_L001",
      "PMACS" = "104974-001-010_S1_L001",
      "cellulose" = "104974-001-011_S1_L001"
    ))) %>%
    mutate_all(as.integer)),
  keep.order = TRUE,
  sets.x.label = "Genes per filter",
  mainbar.y.label = "Intersections for tRNA Qiagen",
  order.by = "freq"
)

# mRNA Illumina
UpSetR::upset(
  as.data.frame(gene_tpm_filtered_binary %>%
    select((comparison_mrna_illumina %>% select(sample) %>% pull())) %>%
    rename(all_of(c(
      "no filter" = "104974-001-012_S1_L001",
      "cent/pip" = "104974-001-013_S1_L001",
      "plasmodipur" = "104974-001-014_S1_L001",
      "PMACS" = "104974-001-015_S1_L001",
      "cellulose" = "104974-001-016_S1_L001"
    ))) %>%
    mutate_all(as.integer)),
  keep.order = TRUE,
  sets.x.label = "Genes per filter",
  mainbar.y.label = "Intersections for mRNA Illumina",
  order.by = "freq"
)

Pk genes - filters per kit

# mRNA Qiagen
UpSetR::upset(
  as.data.frame(gene_tpm_pk_filtered_binary %>%
    select((comparison_mrna_qiagen %>% select(sample) %>% pull())) %>%
    rename(all_of(c(
      "no filter" = "104974-001-001_S1_L001",
      "cent/pip" = "104974-001-002_S1_L001",
      "plasmodipur" = "104974-001-003_S1_L001",
      "PMACS" = "104974-001-004_S1_L001",
      "cellulose" = "104974-001-005_S1_L001"
    ))) %>%
    mutate_all(as.integer)),
  keep.order = TRUE,
  sets.x.label = "Genes per filter",
  mainbar.y.label = "Intersections for mRNA Qiagen",
  order.by = "freq"
)

# tRNA Qiagen
UpSetR::upset(
  as.data.frame(gene_tpm_pk_filtered_binary %>%
    select((comparison_trna_qiagen %>% select(sample) %>% pull())) %>%
    rename(all_of(c(
      "no filter" = "104974-001-007_S1_L001",
      "cent/pip" = "104974-001-008_S1_L001",
      "plasmodipur" = "104974-001-009_S1_L001",
      "PMACS" = "104974-001-010_S1_L001",
      "cellulose" = "104974-001-011_S1_L001"
    ))) %>%
    mutate_all(as.integer)),
  keep.order = TRUE,
  sets.x.label = "Genes per filter",
  mainbar.y.label = "Intersections for tRNA Qiagen",
  order.by = "freq"
)

# mRNA Illumina
UpSetR::upset(
  as.data.frame(gene_tpm_pk_filtered_binary %>%
    select((comparison_mrna_illumina %>% select(sample) %>% pull())) %>%
    rename(all_of(c(
      "no filter" = "104974-001-012_S1_L001",
      "cent/pip" = "104974-001-013_S1_L001",
      "plasmodipur" = "104974-001-014_S1_L001",
      "PMACS" = "104974-001-015_S1_L001",
      "cellulose" = "104974-001-016_S1_L001"
    ))) %>%
    mutate_all(as.integer)),
  keep.order = TRUE,
  sets.x.label = "Genes per filter",
  mainbar.y.label = "Intersections for mRNA Illumina",
  order.by = "freq"
)

Sample-to-sample distances

Which samples are most similar to each other in terms of gene expression profile? Use Euclidean distance between samples on rlog transformed gene counts (filtered to genes with a minimum of 10 reads per sample).

heatmap_of_distances <- function(transformed_data, title) {
  
  sampleDists <- dist(t(assay(transformed_data)))
  sampleDistMatrix <- as.matrix(sampleDists)
  rownames(sampleDistMatrix) <- transformed_data$sample_short
  colnames(sampleDistMatrix) <- NULL
  
    
  # add grouping columns
  anno <- as.data.frame(colData(transformed_data)[, c("kit_name", "filter_name")]) %>%
    mutate(kit_name = as.factor(kit_name)) %>%
    mutate(filter_name = as.factor(filter_name)) %>%
    rename_with(~ str_to_title(str_replace(.x, "_", " ")))
  
  # set names of samples
  colnames(sampleDistMatrix) <- (transformed_data)$sample_short
  rownames(anno) <- (transformed_data)$sample_short
  
  # Adapted from: https://slowkow.com/notes/pheatmap-tutorial/
  mat_colors <- list(
    `Kit Name` = hcl.colors(length(unique(anno$`Kit Name`)), palette = "plasma", alpha = 1),
    `Filter Name` = palette.colors(length(unique(anno$`Filter Name`)), palette = "Okabe-Ito", alpha = 1)
  )
  names(mat_colors$`Kit Name`) <- unique(anno$`Kit Name`)
  names(mat_colors$`Filter Name`) <- unique(anno$`Filter Name`)
  
  # create heatmap
  p <- pheatmap(sampleDistMatrix,
    annotation_col = anno,
    annotation_colors = mat_colors,
    clustering_distance_rows = sampleDists,
    clustering_distance_cols = sampleDists,
    color = viridis(
      n = 256, alpha = 1,
      begin = 0, end = 1, option = "viridis"
    ),
    # main = "Sample-to-sample Euclidian distance based on all genes",
    main = title,
    scale = "none",
    show_rownames = TRUE,
    show_colnames=FALSE
  ) +
    theme(plot.title = element_markdown())
  
  return(p)
}

All genes (rlog)

heatmap_of_distances(rld_filtered, str_wrap("Sample-to-sample Euclidian distance based on all genes - rlog", 40))

## NULL

All genes (vst)

The VST transform leads to slightly different results: the two large blocks remains the same, but the positioning of samples 5, 11 and 16 differs. Overall, since the samples are so similar overall, slight deviations in the clustering of the distance matrix are not too surprising.

heatmap_of_distances(vsd_filtered, str_wrap("Sample-to-sample Euclidian distance based on all genes - vst", 40))

## NULL

P. knowlesi only (rlog)

heatmap_of_distances(rld_filtered[rowData(rld_filtered)$species == "pk", ], str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* genes - rlog before subset", 40))

## NULL

P. knowlesi only (vst)

heatmap_of_distances(vsd_filtered[rowData(vsd_filtered)$species == "pk", ], str_wrap("Sample-to-sample Euclidian distance based on P. knowlesi genes - vst before subset", 40))

## NULL

P. knowlesi only (rlog) - transform after subset

heatmap_of_distances(rld_pk_filtered, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* genes - rlog after subset", 40))

## NULL

P. knowlesi only (vst) - transform after subset

heatmap_of_distances(vsd_pk_filtered, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* genes - vst after subset", 40))

## NULL

P. knowlesi protein-coding only (rlog)

heatmap_of_distances(rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding", ], str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi** protein-coding genes", 40))

## NULL

P. knowlesi protein-coding only (vst)

heatmap_of_distances(vsd_filtered[rowData(vsd_filtered)$species == "pk" & rowData(vsd_filtered)$gene_biotype == "protein_coding", ], str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* protein-coding genes", 40))

## NULL

P. knowlesi protein-coding only (rlog) - transform after subset

heatmap_of_distances(rld_pk_filtered_coding, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* protein-coding genes - rlog after subset", 40))

## NULL

P. knowlesi protein-coding only (vst) - transform after subset

heatmap_of_distances(vsd_pk_filtered_coding, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* protein-coding genes - vst after subset", 40))

## NULL

Heatmap of count matrix

heatmap_of_counts <- function(raw_data, transformed_data, top_type = "mean", top_n = 20, center = TRUE) {
  # select n most highly expressed/variable genes
  if (top_type == "mean") {
    select <- order(rowMeans(counts(raw_data, normalized = FALSE)),
                    decreasing = TRUE
                    )[1:top_n]
  } else if (top_type == "median") {
    select <- order(rowMedians(counts(raw_data, normalized = FALSE)),
                    decreasing = TRUE
                    )[1:top_n]
  } else if (top_type == "var") {
    select <- head(order(rowVars(assay(transformed_data)), decreasing = TRUE), top_n)
  }
  
  # select transformed data and center if requested
  mat <- assay(transformed_data)[select, ]
  
  if (center == TRUE){
    mat <- mat - rowMeans(mat)
  }
  
  # add grouping columns
  anno <- as.data.frame(colData(transformed_data)[, c("kit_name", "filter_name")]) %>%
  mutate(kit_name = as.factor(kit_name)) %>%
  mutate(filter_name = as.factor(filter_name)) %>%
  rename_with(~ str_to_title(str_replace(.x, "_", " ")))
  
  # set names of samples
  colnames(mat) <- (transformed_data)$sample_short
  rownames(anno) <- (transformed_data)$sample_short
  
  # Adapted from: https://slowkow.com/notes/pheatmap-tutorial/
  mat_colors <- list(
    `Kit Name` = hcl.colors(length(unique(anno$`Kit Name`)), palette = "plasma", alpha = 1),
    `Filter Name` = palette.colors(length(unique(anno$`Filter Name`)), palette = "Okabe-Ito", alpha = 1)
  )
  names(mat_colors$`Kit Name`) <- unique(anno$`Kit Name`)
  names(mat_colors$`Filter Name`) <- unique(anno$`Filter Name`)

  # create heatmap
  pheatmap(mat,
    annotation_col = anno,
    annotation_colors = mat_colors,
    color = viridis(n = 256, alpha = 1, begin = 0, end = 1, option = "viridis"),
    # cluster_rows = FALSE,
    # cluster_cols = FALSE,
    show_rownames = FALSE,
  )
  
  # create table of genes
  gene_annotation %>% filter(gene_id %in% rownames(mat))
}

All genes - vst - top expressed (mean)

This plot clusters the top n=20 most highly expressed genes (mean) centred on the gene’s mean expression level across all samples using the VST data.

Several of the top expressed genes are globin and rRNA genes (and most reads are human).

heatmap_of_counts(gse_filtered, vsd_filtered, "mean", 20, TRUE)

All genes - rlog - top expressed (mean)

This plot cluster the top n=20 most highly expressed genes (mean) centred on the gene’s mean expression level across all samples using the rlog transformed data.

Several of the top expressed genes are globin and rRNA genes (and most reads are human).

heatmap_of_counts(gse_filtered, rld_filtered, "mean", 20, TRUE)

All genes - vst - top expressed (median)

This plot clusters the top n=20 most highly expressed genes (median) centred on the gene’s mean expression level across all samples using the VST data.

Several of the top expressed genes are globin and rRNA genes (and most reads are human).

heatmap_of_counts(gse_filtered, vsd_filtered, "median", 20, TRUE)

All genes - rlog - top expressed (median)

This plot cluster the top n=20 most highly expressed genes (median) centred on the gene’s mean expression level across all samples using the rlog transformed data.

Several of the top expressed genes are globin and rRNA genes (and most reads are human).

heatmap_of_counts(gse_filtered, rld_filtered, "median", 20, TRUE)

All genes - vst - top variable

This plot clusters the top n=20 most variable genes centred on the gene’s mean expression level across all samples using the VST data.

Several of the top variable genes are human/Pk rRNA genes (and most reads are human).

heatmap_of_counts(gse_filtered, vsd_filtered, "var", 20, TRUE)

All genes - rlog - top variable

This plot cluster the top n=20 most variable genes centred on the gene’s mean expression level across all samples using the rlog transformed data.

Several of the top variable genes are human genes (but globin and rRNA are rare).

heatmap_of_counts(gse_filtered, rld_filtered, "var", 20, TRUE)

Pk genes protein coding - vst - top expressed (mean)

This plot clusters the top n=20 most highly expressed genes centred on the gene’s mean expression level across all samples using the VST data.

The choice of transformation only has a minor effect on this subset of top-expressed genes.

heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
                  vsd_filtered[rowData(vsd_filtered)$species == "pk" & rowData(vsd_filtered)$gene_biotype == "protein_coding"], 
                  "mean", 20, TRUE)

Pk genes protein coding - rlog - top expressed (mean)

This plot cluster the top n=20 most highly expressed genes centred on the gene’s mean expression level across all samples using the rlog transformed data.

The choice of transformation only has a minor effect on this subset of top-expressed genes.

heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
                  rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding"], 
                  "mean", 20, TRUE)

Pk genes protein coding - vst - top variable

This plot clusters the top n=20 most variable genes centred on the gene’s mean expression level across all samples using the VST data.

The choice of transformation only has a minor effect on this subset of top-variable genes.

heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
                  vsd_filtered[rowData(vsd_filtered)$species == "pk" & rowData(vsd_filtered)$gene_biotype == "protein_coding"], 
                  "var", 20, TRUE)

Pk genes protein coding - rlog - top variable

This plot cluster the top n=20 most variable genes centred on the gene’s mean expression level across all samples using the rlog transformed data.

The choice of transformation only has a minor effect on this subset of top-variable genes.

heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
                  rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding"], 
                  "var", 20, TRUE)

Pk genes protein coding - vst - top expressed genes (mean) - transform after subsetting

This plot cluster the top n=20 most highly expressed Pk protein coding genes (centred on the gene’s mean expression level) across all samples using the vst transformed data, without clustering. The data was transformed after subsetting.

heatmap_of_counts(gse_pk_filtered_coding,
                  vsd_pk_filtered_coding, 
                  "mean", 20, TRUE)

Pk genes protein coding - rlog - top expressed genes (mean) - transform after subsetting

This plot cluster the top n=20 most highly expressed Pk protein coding genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.

heatmap_of_counts(gse_pk_filtered_coding,
                  rld_pk_filtered_coding, 
                  "mean", 20, TRUE)

Pk genes protein coding - vst - top variable - transform after subsetting

This plot cluster the top n=20 most highly variable Pk protein coding genes across all samples using the vst transformed data, without clustering. The data was transformed after subsetting.

heatmap_of_counts(gse_pk_filtered_coding,
                  vsd_pk_filtered_coding, 
                  "var", 20, TRUE)

Pk genes protein coding - rlog - top variable - transform after subsetting

This plot cluster the top n=20 most highly variable Pk protein coding genes across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.

heatmap_of_counts(gse_pk_filtered_coding,
                  rld_pk_filtered_coding, 
                  "var", 20, TRUE)

Pk genes - vst - top expressed genes

This plot cluster the top n=20 most highly expressed Pk genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering.

Several of the top expressed genes in the previous plot are rRNA genes, but not all of them.

heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
                  vsd_filtered[rowData(gse_filtered)$species == "pk"],
                  "mean", 20, TRUE)

Pk genes - rlog - top expressed genes

This plot cluster the top n=20 most highly expressed Pk genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering.

heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
                  vsd_filtered[rowData(gse_filtered)$species == "pk"], 
                  "mean", 20, TRUE)

Pk genes - vst - top variable

This plot cluster the top n=20 most highly variable Pk genes across all samples using the rlog transformed data, without clustering.

heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
                  vsd_filtered[rowData(gse_filtered)$species == "pk"],
                  "var", 20, TRUE)

Pk genes - rlog - top variable

This plot cluster the top n=20 most highly variable Pk genes across all samples using the rlog transformed data, without clustering.

heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
                  vsd_filtered[rowData(gse_filtered)$species == "pk"], 
                  "var", 20, TRUE)

Pk genes - vst - top expressed genes - transform after subsetting

This plot cluster the top n=20 most highly expressed Pk genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.

Several of the top expressed genes in the previous plot are rRNA genes, but not all of them.

heatmap_of_counts(gse_pk_filtered,
                  vsd_pk_filtered,
                  "mean", 20, TRUE)

Pk genes - rlog - top expressed genes - transform after subsetting

This plot cluster the top n=20 most highly expressed Pk genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.

Several of the top expressed genes in the previous plot are rRNA genes, but not all of them.

heatmap_of_counts(gse_pk_filtered,
                  rld_pk_filtered, 
                  "mean", 20, TRUE)

Pk genes - vst - top variable - transform after subsetting

This plot cluster the top n=20 most highly variable Pk genes across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.

heatmap_of_counts(gse_pk_filtered,
                  vsd_pk_filtered,
                  "var", 20, TRUE)

Pk genes - rlog - top variable - transform after subsetting

This plot cluster the top n=20 most highly variable Pk genes across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.

heatmap_of_counts(gse_pk_filtered,
                  rld_pk_filtered, 
                  "var", 20, TRUE)

Expression count correlation plots

Protein-coding Pk genes - TPM

The following figure plots the (log-transformed) TPM of each protein-coding P. knowlesi gene (filtered on a minimum count of 10 for at least one sample) for all pairwise combinations of samples.

mat <- round(
          cor(
            gene_tpm_filtered %>%
              filter(species == "pk") %>%
              filter(gene_biotype == "protein_coding") %>% 
              select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`) %>% 
              rename_with(~samples %>% pull(sample_name) %>% as.character(), everything())
            ),
          1)
corr <- mat

ggcorrplot(corr,
           type = "lower",
           method = "circle",
           hc.order = FALSE,
           outline.color = "white",
           p.mat = cor_pmat(mat),
           lab = TRUE,
           legend.title = "Pearson correlation of TPM values") +
  scale_fill_gradientn(colors = viridis(256, option = 'D'), limits = c(-1,1),
                       name = "Pearson correlation")
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

correlation_plot <- ggpairs(
  gene_tpm_filtered %>%
    filter(species == "pk") %>%
    filter(gene_biotype == "protein_coding"),
  columns = 3:18,
  columnLabels = samples$sample_short,
  upper = list(continuous = wrap("cor", size = 5)),
) +
  theme(
    axis.text = element_text(size = 9),
    axis.title = element_text(size = 9),
    strip.text.x = element_text(size = 11),
    strip.text.y = element_text(size = 10),
    axis.text.x = element_text(size = 6),
    axis.text.y = element_text(size = 9)
    )
  # scale_y_discrete()

correlation_plot

correlation_plot_log <- ggpairs(
  gene_tpm_filtered %>%
    filter(species == "pk") %>%
    filter(gene_biotype == "protein_coding"),
  columns = 3:18, # `104974-001-001_S1_L001`:`104974-001-002_S1_L001`,
  lower = list(continuous = "smooth"),
  upper = list(continuous = wrap("cor", size = 5)),
  columnLabels = samples$sample_short
) +
  theme(
    axis.text = element_text(size = 9),
    axis.title = element_text(size = 9),
    strip.text.x = element_text(size = 11),
    strip.text.y = element_text(size = 10),
    axis.text.x = element_text(size = 6),
    axis.text.y = element_text(size = 9)
    ) +
  scale_x_continuous(trans = "log1p") +
  scale_y_continuous(trans = "log1p")

correlation_plot_log

PCA plots

All PCA plots use the top 500 most variable genes and are filtered to only genes with a minimum count of 10 reads for at least one sample, unless otherwise stated.

All genes (rlog)

pcaData <- plotPCA(rld_filtered, intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of rlog-transformed counts of human and ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Filter method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

All genes (vst)

pcaData <- plotPCA(vsd_filtered, intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of VST counts of human and ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Filter method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes (rlog)

pcaData <- plotPCA(rld_filtered[rowData(rld_filtered)$species == "pk", ], intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of rlog-transformed counts of ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes (vst)

pcaData <- plotPCA(vsd_filtered[rowData(vsd_filtered)$species == "pk", ], intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes protein-coding (rlog)

pcaData <- plotPCA(rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding"], intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of rlog-transformed counts of ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes protein-coding (vst)

pcaData <- plotPCA(vsd_filtered[rowData(vsd_filtered)$species == "pk" & rowData(vsd_filtered)$gene_biotype == "protein_coding"], intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes (rlog) - transform after subset

pcaData <- plotPCA(rld_pk_filtered, intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of rlog-transformed counts of ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes (vst) - transform after subset

pcaData <- plotPCA(vsd_pk_filtered, intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes protein-coding (rlog)

protein_gene_filter <- intersect(rownames(rowData(rld_filtered)), gene_annotation %>% filter(gene_biotype == "protein_coding" & species == "pk") %>% select(gene_id) %>% pull())
pcaData <- plotPCA(rld_filtered[protein_gene_filter, ], intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of rlog-transformed counts of ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes protein-coding (vst)

protein_gene_filter <- intersect(rownames(rowData(vsd_filtered)), gene_annotation %>% filter(gene_biotype == "protein_coding" & species == "pk") %>% select(gene_id) %>% pull())
pcaData <- plotPCA(vsd_filtered[protein_gene_filter, ], intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes protein-coding (rlog) - transform after subset

pcaData <- plotPCA(rld_pk_filtered_coding, intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of rlog counts of ", italic("P. knowlesi"), " protein coding genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes protein-coding (vst) - transform after subset

pcaData <- plotPCA(vsd_pk_filtered_coding, intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " protein coding genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Figures for publication

Sum of gene counts

Ordered by library/depletion and filter

samples_order <- samples %>% 
  arrange(factor(samples$kit, levels = c("mRNA_illumina", "mRNA_qiagen_FSglobH", "mRNA_qiagen_FSglob", "tRNA_qiagen_FSglobH")),
          factor(samples$WBC_depletion, levels = c("nofilter", "centpip", "plasmo", "pmacs", "cellulose"))) %>% 
  select(sample_name)

p2_a <- gene_counts_txi_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, species) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>% 
  # arrange(factor(sample_name, levels = samples_order %>% pull())) %>% 
  mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>% 
  ggplot(aes(x = sum_reads, y = sample_name, fill = species)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev,
    # labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  ) +
  scale_y_discrete(limits = rev) +
  scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(y = "Sample", x = "Sum of all gene counts", fill = "RNA type") +
  theme_classic() +
  ggtitle("A")

# print p2a to show y labels
p2_a

p2_a <- p2_a + theme(axis.title.y = element_blank(), axis.text.y = element_blank())

p2_b <- gene_tpm_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>% 
  ggplot(aes(x = sum_reads, y = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_manual(
    values = c(`human` = "#440154FF", 
               `human_globin` = "#3B528BFF", 
               `human_rRNA` = "#21908CFF",
               `pk_rRNA` = "#5DC863FF",
               `pk` = "#FDE725FF"),
      labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")),
                 expression(italic("P. knowlesi rRNA")))
    ) +
  # scale_fill_viridis(
  #   option = "viridis", discrete = T,
  #   # limits = rev,
  #   labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")),
  #              expression(italic("P. knowlesi rRNA")))
  # ) +
  scale_y_discrete(limits = rev) +
  scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(y = "Sample", x = "Sum of TPM counts", fill = "RNA type") +
  theme_classic() +
  theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
  ggtitle("B")

p2_c <- gene_counts_txi_filtered_long %>%
  left_join(samples, by = "sample") %>%
  filter(species == "pk" & gene_biotype == "protein_coding") %>%
  group_by(sample_name,) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>% 
  ggplot(aes(x = sum_reads, y = sample_name)) +
  geom_bar(position = "stack", stat = "identity", fill = "#FDE725FF") +
  # scale_fill_manual(viridis::viridis(5)[4:5]) +
  # scale_fill_manual(
  #   values = c(pk = "#5DC863FF", pk_rRNA = "#FDE725FF"),
  #   # scale_fill_viridis(
  #   #   option = "viridis", discrete = T,
  #   #   # limits = rev,
  #   labels = c(expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  # ) +
  scale_y_discrete(limits = rev) +
  scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(y = "Sample", x = str_wrap("Sum of *P. knowlesi* protein-coding gene counts", 10), fill = "RNA type") +
  theme_classic() +
  theme(axis.title.y = element_blank(), axis.text.y = element_blank(), axis.title.x = ggtext::element_markdown()) +
  ggtitle("C")

fig <- ggarrange(
  p2_a, p2_b, p2_c,
  ncol = 3,
  legend = "bottom",
  common.legend = TRUE,
  legend.grob = get_legend(p2_b)
  # widths = c(2,1,2)
)
fig

ggsave(filename = here("results-refseq/figures/gene_counts.svg"), 
      plot = fig, width=10, height=6)

Ordered by sample number

p2_a <- gene_counts_txi_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev,
    labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Sum of gene counts", fill = "RNA type") +
  theme_classic() +
  ggtitle("A")

# print p2a to show y labels
p2_a

p2_a <- p2_a + theme(axis.title.y = element_blank(), axis.text.y = element_blank())


p2_b <- gene_tpm_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev,
    labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Sum of TPM counts", fill = "RNA type") +
  theme_classic() +
  theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
  ggtitle("B")

p2_c <- gene_counts_txi_filtered_long %>%
  left_join(samples, by = "sample") %>%
  filter(species == "pk") %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  # scale_fill_manual(viridis::viridis(5)[4:5]) +
  scale_fill_manual(
    values = c(pk = "#5DC863FF", pk_rRNA = "#FDE725FF"),
    # scale_fill_viridis(
    #   option = "viridis", discrete = T,
    #   # limits = rev,
    labels = c(expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = expression(paste("Sum of ", italic("P. knowlesi"), " gene counts")), fill = "RNA type") +
  theme_classic() +
  theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
  ggtitle("C")

fig <- ggarrange(
  p2_a, p2_b, p2_c,
  ncol = 3,
  common.legend = TRUE, legend = "bottom"
)
fig

Ordered by library - depletion - filter

samples_order <- samples %>% 
  arrange(factor(samples$kit, levels = c("mRNA_illumina", "mRNA_qiagen_FSglobH", "mRNA_qiagen_FSglob", "tRNA_qiagen_FSglobH")),
          factor(samples$WBC_depletion, levels = c("nofilter", "centpip", "plasmo", "pmacs", "cellulose"))) %>% 
  select(sample_name)

p2_a <- gene_counts_txi_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>% 
  # arrange(factor(sample_name, levels = samples_order %>% pull())) %>% 
  mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>% 
  ggplot(aes(x = sum_reads, y = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev,
    labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  ) +
  scale_y_discrete(limits = rev) +
  scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(y = "Sample", x = "Sum of gene counts", fill = "RNA type") +
  theme_classic() +
  ggtitle("A")

# print p2a to show y labels
p2_a

p2_a <- p2_a + theme(axis.title.y = element_blank(), axis.text.y = element_blank())

p2_b <- gene_tpm_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>% 
  ggplot(aes(x = sum_reads, y = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev,
    labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  ) +
  scale_y_discrete(limits = rev) +
  scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(y = "Sample", x = "Sum of TPM counts", fill = "RNA type") +
  theme_classic() +
  theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
  ggtitle("B")

p2_c <- gene_counts_txi_filtered_long %>%
  left_join(samples, by = "sample") %>%
  filter(species == "pk") %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>% 
  ggplot(aes(x = sum_reads, y = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  # scale_fill_manual(viridis::viridis(5)[4:5]) +
  scale_fill_manual(
    values = c(pk = "#5DC863FF", pk_rRNA = "#FDE725FF"),
    # scale_fill_viridis(
    #   option = "viridis", discrete = T,
    #   # limits = rev,
    labels = c(expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  ) +
  scale_y_discrete(limits = rev) +
  scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(y = "Sample", x = expression(paste("Sum of ", italic("P. knowlesi"), " gene counts")), fill = "RNA type") +
  theme_classic() +
  theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
  ggtitle("C")

fig <- ggarrange(
  p2_a, p2_b, p2_c,
  ncol = 3,
  common.legend = TRUE, legend = "bottom"
)
fig

Heatmaps of distances

heatmap_of_distances <- function(transformed_data, title) {
  
  sampleDists <- dist(t(assay(transformed_data)))
  sampleDistMatrix <- as.matrix(sampleDists)
  rownames(sampleDistMatrix) <- transformed_data$sample_short
  colnames(sampleDistMatrix) <- NULL
  
    
  # add grouping columns
  anno <- as.data.frame(colData(transformed_data)[, c("kit_name", "filter_name")]) %>%
    mutate(kit_name = as.factor(kit_name)) %>%
    mutate(filter_name = as.factor(filter_name)) %>%
    rename_with(~ str_to_title(str_replace(.x, "_", " ")))
  
  # set names of samples
  colnames(sampleDistMatrix) <- (transformed_data)$sample_short
  rownames(anno) <- (transformed_data)$sample_short
  
  # Adapted from: https://slowkow.com/notes/pheatmap-tutorial/
  mat_colors <- list(
    `Kit Name` = hcl.colors(length(unique(anno$`Kit Name`)), palette = "plasma", alpha = 1),
    `Filter Name` = palette.colors(length(unique(anno$`Filter Name`)), palette = "Okabe-Ito", alpha = 1)
  )
  names(mat_colors$`Kit Name`) <- unique(anno$`Kit Name`)
  names(mat_colors$`Filter Name`) <- unique(anno$`Filter Name`)
  
  # create heatmap
  p <- pheatmap(sampleDistMatrix,
    annotation_col = anno,
    annotation_colors = mat_colors,
    clustering_distance_rows = sampleDists,
    clustering_distance_cols = sampleDists,
    color = viridis(
      n = 256, alpha = 1,
      begin = 0, end = 1, option = "viridis"
    ),
    # main = "Sample-to-sample Euclidian distance based on all genes",
    # main = title,
    scale = "none",
    show_rownames = TRUE,
    show_colnames=FALSE
  ) 
  
  return(p)
}

p <- heatmap_of_distances(rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding", ], str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi** protein-coding genes", 40))
ggsave(filename = here("results-refseq/figures/genes_pk_coding_sample_dist_rlog.svg"), 
      plot = p)
## Saving 7 x 5 in image
p

p <- heatmap_of_distances(rld_pk_filtered_coding, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* protein-coding genes - rlog after subset", 40))
ggsave(filename = here("results-refseq/figures/genes_pk_coding_sample_dist_rlog_transform_after_subset.svg"), 
      plot = p)
## Saving 7 x 5 in image
p

Heatmaps of count matrix

heatmap_of_counts <- function(raw_data, transformed_data, top_type = "mean", top_n = 20, center = TRUE) {
  # select n most highly expressed/variable genes
  if (top_type == "mean") {
    select <- order(rowMeans(counts(raw_data, normalized = FALSE)),
                    decreasing = TRUE
                    )[1:top_n]
  } else if (top_type == "median") {
    select <- order(rowMedians(counts(raw_data, normalized = FALSE)),
                    decreasing = TRUE
                    )[1:top_n]
  } else if (top_type == "var") {
    select <- head(order(rowVars(assay(transformed_data)), decreasing = TRUE), top_n)
  }
  
  # select transformed data and center if requested
  mat <- assay(transformed_data)[select, ]
  
  if (center == TRUE){
    mat <- mat - rowMeans(mat)
  }
  
  # add grouping columns
  anno <- as.data.frame(colData(transformed_data)[, c("kit_name", "filter_name")]) %>%
  mutate(kit_name = as.factor(kit_name)) %>%
  mutate(filter_name = as.factor(filter_name)) %>%
  rename_with(~ str_to_title(str_replace(.x, "_", " ")))
  
  # set names of samples
  colnames(mat) <- (transformed_data)$sample_short
  rownames(anno) <- (transformed_data)$sample_short
  
  # Adapted from: https://slowkow.com/notes/pheatmap-tutorial/
  mat_colors <- list(
    `Kit Name` = hcl.colors(length(unique(anno$`Kit Name`)), palette = "plasma", alpha = 1),
    `Filter Name` = palette.colors(length(unique(anno$`Filter Name`)), palette = "Okabe-Ito", alpha = 1)
  )
  names(mat_colors$`Kit Name`) <- unique(anno$`Kit Name`)
  names(mat_colors$`Filter Name`) <- unique(anno$`Filter Name`)

  # create heatmap
  pheatmap(mat,
    annotation_col = anno,
    annotation_colors = mat_colors,
    color = viridis(n = 256, alpha = 1, begin = 0, end = 1, option = "viridis"),
    # cluster_rows = FALSE,
    # cluster_cols = FALSE,
    show_rownames = FALSE,
  )
}

# p <- heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
#                   rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding"], 
#                   "mean", 20, TRUE)
# ggsave(filename = here("results-refseq/figures/genes_pk_coding_top_exp_rlog.svg"), 
#       plot = p, width=10, height=8)

p <- heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
                  rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding"], 
                  "var", 20, TRUE)
ggsave(filename = here("results-refseq/figures/genes_pk_coding_top_var_rlog.svg"), 
      plot = p)
## Saving 7 x 5 in image
p

# p <- heatmap_of_counts(gse_pk_filtered_coding,
#                   rld_pk_filtered_coding, 
#                   "mean", 20, TRUE)
# ggsave(filename = here("results-refseq/figures/genes_pk_coding_top_exp_rlog_transform_after_subset.svg"), 
#       plot = p, width=10, height=8)

p <- heatmap_of_counts(gse_pk_filtered_coding,
                  rld_pk_filtered_coding, 
                  "var", 20, TRUE)
ggsave(filename = here("results-refseq/figures/genes_pk_coding_top_var_rlog_transform_after_subset.svg"), 
      plot = p)
## Saving 7 x 5 in image
p

# p <- heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
#                   vsd_filtered[rowData(gse_filtered)$species == "pk"], 
#                   "mean", 20, TRUE)
# ggsave(filename = here("results-refseq/figures/genes_pk_all_top_exp_rlog.svg"), 
#       plot = p, width=10, height=8)

p <- heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
                  vsd_filtered[rowData(gse_filtered)$species == "pk"], 
                  "var", 20, TRUE)
ggsave(filename = here("results-refseq/figures/genes_pk_all_top_var_rlog.svg"), 
      plot = p)
## Saving 7 x 5 in image
p

# p <- heatmap_of_counts(gse_pk_filtered,
#                   rld_pk_filtered, 
#                   "mean", 20, TRUE)
# ggsave(filename = here("results-refseq/figures/genes_pk_all_top_exp_rlog_transform_after_subset.svg"), 
#       plot = p, width=10, height=8)

p <- heatmap_of_counts(gse_pk_filtered,
                  rld_pk_filtered, 
                  "var", 20, TRUE)
ggsave(filename = here("results-refseq/figures/genes_pk_all_top_var_rlog_transform_after_subset.svg"), 
      plot = p)
## Saving 7 x 5 in image
p

PCA plots

Pk genes protein-coding (rlog) - transform after subset

pcaData <- plotPCA(rld_pk_filtered_coding, intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
p <- ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of rlog counts of ", italic("P. knowlesi"), " protein coding genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

ggsave(filename = here("results-refseq/figures/pca_pk_coding_rlog_transform_after_subset.svg"), 
      plot = p)
## Saving 7 x 5 in image
p

Gene expression count correlation

mat <- round(
          cor(
            gene_tpm_filtered %>%
              filter(species == "pk") %>%
              filter(gene_biotype == "protein_coding") %>% 
              select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`) %>% 
              rename_with(~samples %>% pull(sample_name) %>% as.character(), everything())
            ),
          1)
corr <- mat

p <- ggcorrplot(corr,
           type = "lower",
           method = "circle",
           hc.order = FALSE,
           outline.color = "white",
           p.mat = cor_pmat(mat),
           lab = TRUE,
           legend.title = "Pearson correlation of TPM values") +
  scale_fill_gradientn(colors = viridis(256, option = 'D'), limits = c(-1,1), name = "Pearson correlation")
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
ggsave(filename = here("results-refseq/figures/pk_coding_tpm_correlation.svg"), 
      plot = p,
      width = 10, height = 10)
p

# ordered by mRNA - tRNA

mat <- round(
          cor(
            gene_tpm_filtered %>%
              filter(species == "pk") %>%
              filter(gene_biotype == "protein_coding") %>% 
              select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`) %>% 
              rename_with(~samples %>% pull(sample_name) %>% as.character(), everything()) %>% 
              relocate(samples_order %>% pull())
            ),
          1)
corr <- mat

p <- ggcorrplot(corr,
           type = "lower",
           method = "circle",
           hc.order = FALSE,
           outline.color = "white",
           p.mat = cor_pmat(mat),
           lab = TRUE,
           legend.title = "Pearson correlation of TPM values") +
  scale_fill_gradientn(colors = viridis(256, option = 'D'), limits = c(-1,1), name = "Pearson correlation")
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
ggsave(filename = here("results-refseq/figures/pk_coding_tpm_correlation_ordered.svg"), 
      plot = p,
      width = 10, height = 10)
p

correlation_plot

ggsave(filename = here("results-refseq/figures/correlation_protein_coding_genes_pk.png"), 
      plot = correlation_plot,
      height = 4000, width = 6000, units = "px")
ggsave(filename = here("results-refseq/figures/correlation_protein_coding_genes_pk.svg"),
      plot = correlation_plot,
      height = 4000, width = 6000, units = "px")

# order by tRNA - mRNA

samples_order <- samples %>% 
  arrange(factor(samples$kit,
                 levels = c("mRNA_illumina", "mRNA_qiagen_FSglobH", "mRNA_qiagen_FSglob", "tRNA_qiagen_FSglobH")),
          factor(samples$WBC_depletion,
                 levels = c("nofilter", "centpip", "plasmo", "pmacs", "cellulose")))

correlation_plot_ordered <- ggpairs(
  gene_tpm_filtered %>%
    filter(species == "pk") %>%
    filter(gene_biotype == "protein_coding") %>% 
    relocate(
      samples_order %>% select(sample) %>% pull()
      ),
  columns = 1:16,
  # columnLabels = samples_order %>% select(sample_short) %>% pull(),
  columnLabels = samples_order %>% select(sample_short) %>% mutate(sample_short = factor(sample_short, levels = sample_short)) %>% pull(),
  upper = list(continuous = wrap("cor", size = 5)),
) +
  theme(
    axis.text = element_text(size = 9),
    axis.title = element_text(size = 9),
    strip.text.x = element_text(size = 11),
    strip.text.y = element_text(size = 10),
    axis.text.x = element_text(size = 6),
    axis.text.y = element_text(size = 9)
    )
  # scale_y_discrete()

correlation_plot_ordered

ggsave(filename = here("results-refseq/figures/correlation_protein_coding_genes_pk_ordered.png"), 
      plot = correlation_plot_ordered,
      height = 4000, width = 6000, units = "px")
ggsave(filename = here("results-refseq/figures/correlation_protein_coding_genes_pk_ordered.svg"),
      plot = correlation_plot_ordered,
      height = 4000, width = 6000, units = "px")
correlation_plot_log

ggsave(here("results-refseq/figures/correlation_protein_coding_genes_logscaled.png"), 
       plot = correlation_plot_log,
       height = 4000, width = 6000, units = "px")
ggsave(here("results-refseq/figures/correlation_protein_coding_genes_logscaled.svg"), 
       plot = correlation_plot_log,
       height = 4000, width = 6000, units = "px")

# order by tRNA-mRNA

correlation_plot_log_ordered <- ggpairs(
  gene_tpm_filtered %>%
    filter(species == "pk") %>%
    filter(gene_biotype == "protein_coding") %>% 
    relocate(
      samples_order %>% select(sample) %>% pull()
    ),
  columns = 1:16, # `104974-001-001_S1_L001`:`104974-001-002_S1_L001`,
  lower = list(continuous = "smooth"),
  upper = list(continuous = wrap("cor", size = 5)),
  columnLabels = samples_order %>% select(sample_short) %>% mutate(sample_short = factor(sample_short, levels = sample_short)) %>% pull(),,
) +
  theme(
    axis.text = element_text(size = 9),
    axis.title = element_text(size = 9),
    strip.text.x = element_text(size = 11),
    strip.text.y = element_text(size = 10),
    axis.text.x = element_text(size = 6),
    axis.text.y = element_text(size = 9)
    ) +
  scale_x_continuous(trans = "log1p") +
  scale_y_continuous(trans = "log1p")

correlation_plot_log

ggsave(here("results-refseq/figures/correlation_protein_coding_genes_logscaled_ordered.png"), 
       plot = correlation_plot_log_ordered,
       height = 4000, width = 6000, units = "px")
ggsave(here("results-refseq/figures/correlation_protein_coding_genes_logscaled_ordered.svg"), 
       plot = correlation_plot_log_ordered,
       height = 4000, width = 6000, units = "px")

Number of Pk genes in transcript_info file:

dim(transcript_info$gene %>% filter(grepl("PKNH", gene_id)))
## [1] 5393    2

Number of Pk genes in count matrices:

gene_tpm %>% filter(species == "pk")

Number of Pk genes in gff:

read_delim(here("data/ref-refseq-refseq/GCF_000006355.2_GCA_000006355.2_genomic.gff"), comment = "#", col_names = c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute")) %>%
  mutate(id = row_number()) %>%
  separate_rows(attribute, sep = ";") %>%
  separate(attribute, c("attribute", "attribute_value"), sep = "=") %>%
  pivot_wider(names_from = attribute, values_from = attribute_value) %>%
  filter(!is.na(gene_biotype))
## Rows: 40620 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): seqname, source, feature, score, strand, frame, attribute
## dbl (2): start, end
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
read_delim(here("data/ref-refseq-refseq/GCF_000006355.2_GCA_000006355.2_genomic.gff"), comment = "#", col_names = c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute")) %>%
  mutate(id = row_number()) %>%
  separate_rows(attribute, sep = ";") %>%
  separate(attribute, c("attribute", "attribute_value"), sep = "=") %>%
  pivot_wider(names_from = attribute, values_from = attribute_value) %>%
  filter(feature == "gene")
## Rows: 40620 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): seqname, source, feature, score, strand, frame, attribute
## dbl (2): start, end
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
read_delim(here("data/ref-refseq-refseq/GCF_000006355.2_GCA_000006355.2_genomic.gff"), comment = "#", col_names = c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute")) %>%
  mutate(id = row_number()) %>%
  separate_rows(attribute, sep = ";") %>%
  separate(attribute, c("attribute", "attribute_value"), sep = "=") %>%
  pivot_wider(names_from = attribute, values_from = attribute_value) %>%
  filter(gene_biotype == "protein_coding")
## Rows: 40620 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): seqname, source, feature, score, strand, frame, attribute
## dbl (2): start, end
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Non-protein coding genes:

read_delim(here("data/ref-refseq-refseq/GCF_000006355.2_GCA_000006355.2_genomic.gff"), comment = "#", col_names = c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute")) %>%
  mutate(id = row_number()) %>%
  separate_rows(attribute, sep = ";") %>%
  separate(attribute, c("attribute", "attribute_value"), sep = "=") %>%
  pivot_wider(names_from = attribute, values_from = attribute_value) %>%
  filter(!gene_biotype == "protein_coding")
## Rows: 40620 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): seqname, source, feature, score, strand, frame, attribute
## dbl (2): start, end
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
read_delim(here("data/ref-refseq-refseq/GCF_000006355.2_GCA_000006355.2_genomic.gff"), comment = "#", col_names = c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute")) %>%
  mutate(id = row_number()) %>%
  separate_rows(attribute, sep = ";") %>%
  separate(attribute, c("attribute", "attribute_value"), sep = "=") %>%
  pivot_wider(names_from = attribute, values_from = attribute_value) %>%
  filter(gene_biotype == "rRNA")
## Rows: 40620 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): seqname, source, feature, score, strand, frame, attribute
## dbl (2): start, end
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.